diff --git a/Project.toml b/Project.toml
index ce13729bb..44c0dda63 100644
--- a/Project.toml
+++ b/Project.toml
@@ -26,14 +26,19 @@ MathOptSetDistances = "0.1"
julia = "1"
[extras]
+CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
Clp = "e2554f3b-3117-50c0-817c-e040a3ddf72d"
+DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
+Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
GLPK = "60bf3e95-4087-53dc-ae20-288a0d20c6a6"
Ipopt = "b6b21f68-93f8-5de0-b562-5493be1d77c9"
-JuMP = "4076af6c-e467-56ae-b986-b466b2749572"
OSQP = "ab2f91bb-94b4-55e3-9ba0-7f65df51de79"
+Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
SCS = "c946c3f1-0d1f-5ce8-9dea-7daa1f7e2d13"
+Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
+HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
[targets]
-test = ["Test", "Clp", "DelimitedFiles", "SCS", "OSQP", "GLPK", "Ipopt", "JuMP"]
+test = ["Test", "Clp", "DelimitedFiles", "SCS", "OSQP", "GLPK", "Ipopt", "CSV", "Statistics", "DataFrames", "Flux", "Printf", "HTTP"]
diff --git a/README.md b/README.md
index 345b89013..ed805bbb6 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
# DiffOpt.jl
-[](https://jump.dev/DiffOpt.jl/stable)
+
[](https://jump.dev/DiffOpt.jl/dev)
[](https://github.com/jump-dev/DiffOpt.jl/actions?query=workflow%3ACI)
[](https://codecov.io/gh/jump-dev/DiffOpt.jl)
@@ -65,7 +65,8 @@ then one can compute gradients by providing perturbations
grads = backward(diff, dA, db, dc)
```
+
diff --git a/docs/Project.toml b/docs/Project.toml
index 5a2d097f4..bbd3a0b75 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -2,7 +2,16 @@
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
Clp = "e2554f3b-3117-50c0-817c-e040a3ddf72d"
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
+Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
JuMP = "4076af6c-e467-56ae-b986-b466b2749572"
MathOptInterface = "b8f27783-ece8-5eb3-8dc8-9495eed66fee"
+MatrixOptInterface = "2f4eb8e6-3e35-4ae4-8c7a-f3d7d9bf20ed"
+OSQP = "ab2f91bb-94b4-55e3-9ba0-7f65df51de79"
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
SCS = "c946c3f1-0d1f-5ce8-9dea-7daa1f7e2d13"
+Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
+CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
+DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
+HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
+Dualization = "191a621a-6537-11e9-281d-650236a99e60"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
\ No newline at end of file
diff --git a/docs/make.jl b/docs/make.jl
index c8a5668d4..ca565d9af 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -15,13 +15,19 @@ makedocs(;
"Usage" => "usage.md",
"Reference" => "reference.md",
"Examples" => [
- "Solving an LP" => "solve-LP.md",
- "Solving a QP" => "solve-QP.md",
"Solving conic with PSD and SOC constraints" => "solve-conic-1.md",
"Differentiating a simple QP by hand" => "matrix-inversion-manual.md",
- "Sensitivity Analysis SVM" => "sensitivity-analysis-svm.md",
+ "Sensitivity Analysis" => [
+ "SVM" => "sensitivity-analysis-svm.md",
+ "Ridge Regression" => "sensitivity-analysis-ridge.md",
+ ],
+ "Hyper-parameter optimization" => "autotuning-ridge.md",
+ "Custom Neural Network Layers" => [
+ "ReLU Layer" => "custom-relu.md",
+ "SVM as a Layer" => "custom-svm.md",
+ ],
"ChainRules integration" => "chainrules_unit.md",
- ],
+ ]
],
strict = true, # See https://github.com/JuliaOpt/JuMP.jl/issues/1576
repo="https://github.com/jump-dev/DiffOpt.jl",
diff --git a/docs/src/autotuning-ridge.md b/docs/src/autotuning-ridge.md
new file mode 100644
index 000000000..afac9aebd
--- /dev/null
+++ b/docs/src/autotuning-ridge.md
@@ -0,0 +1,226 @@
+# Auto-tuning Hyperparameters
+
+This example shows how to learn the hyperparameters in Ridge Regression using a gradient descent routine. Let the problem be modelled as
+
+```math
+\begin{equation}
+\min_{w} \quad \frac{1}{2n} \sum_{i=1}^{n} (y_{i} - w^T x_{i})^2 + \alpha \| w \|_2^2
+\end{equation}
+```
+
+where
+- `x`, `y` are the data points
+- `w` constitutes weights of the regressing line
+- `α` is the only hyperparameter - regularization constant
+
+
+
+```@example 3
+using DiffOpt
+using Statistics
+using OSQP
+using JuMP
+using Plots
+import Random
+using LinearAlgebra
+nothing # hide
+```
+
+
+```@example 3
+"""
+Return the coefficient of determination R2 of the prediction.
+Best possible score is 1.0, it can be negative because the model can be arbitrarily worse
+"""
+function R2(y_true, y_pred)
+ u = sum((y_pred - y_true).^2) # Regression sum of squares
+ v = sum((y_true .- mean(y_true)).^2) # Total sum of squares
+
+ return 1-(u/v)
+end
+
+function create_problem(N, D, noise)
+ w = rand(D)
+ X = rand(N, D)
+
+ # if noise=0, then there is no need of regularization
+    # and alpha=0 will give the best R2 pred score
+ Y = X*w .+ noise*randn(N)
+
+ # test train split
+ l = N ÷ 2
+ return X[1:l, :], X[l+1:N, :], Y[1:l], Y[l+1:N]
+end
+
+X_train, X_test, Y_train, Y_test = create_problem(800, 30, 4);
+```
+
+
+```@example 3
+function fitRidge(X,Y,α)
+ model = Model(() -> diff_optimizer(OSQP.Optimizer))
+
+ N, D = size(X)
+
+ # add variables
+ @variable(model, w[1:D]>=-10)
+ set_optimizer_attribute(model, MOI.Silent(), true)
+
+ @objective(
+ model,
+ Min,
+ sum((Y - X*w).*(Y - X*w))/(2.0*N) + α*(w'w),
+ )
+
+ optimize!(model)
+
+ custom_loss = objective_value(model)
+ return model, w, custom_loss, value.(w)
+end
+nothing # hide
+```
+
+
+
+```@example 3
+αs = [0.0, 1e-3, 2e-3, 5e-3, 1e-2, 2e-2, 5e-2, 7e-2, 2e-1, 3e-1, .5, .7, 1.0]
+Rs = Float64[]
+mse = Float64[]
+
+for α in αs
+ _, _, _, w_train = fitRidge(X_train, Y_train, α)
+ Y_pred = X_test*w_train
+ push!(Rs, R2(Y_test, Y_pred))
+ push!(mse, sum((Y_pred - Y_test).^2))
+end
+nothing # hide
+```
+
+
+```@example 3
+plot(log.(αs), Rs*10, label="R2 prediction score", xaxis = ("log(α)"))
+```
+
+
+
+
+```@example 3
+plot(log.(αs), mse, label="MSE", xaxis = ("log(α)"))
+```
+
+
+
+
+# Plotting ∂l/∂α
+
+
+```@example 3
+function ∇model(model, X_train, w, ŵ, α)
+ N, D = size(X_train)
+ dw = zeros(D)
+ ∂w_∂α = zeros(D)
+
+ for i in 1:D
+ dw[i] = 1.0 #set
+
+ MOI.set(
+ model,
+ DiffOpt.ForwardInObjective(),
+ MOI.ScalarQuadraticFunction(
+ [MOI.ScalarAffineTerm(0.0, w[i].index)],
+ [MOI.ScalarQuadraticTerm(dw[i]*α, w[i].index, w[i].index)],
+ 0.0
+ )
+ )
+
+ DiffOpt.forward(model) # find grad
+
+ ∂w_∂α[i] = MOI.get(
+ model,
+ DiffOpt.ForwardOutVariablePrimal(),
+ w[i]
+ )
+
+ dw[i] = 0.0 #unset
+ end
+ return sqrt(ŵ'ŵ) + 2α*(ŵ'∂w_∂α) - sum((X_train*∂w_∂α).*(Y_train - X_train*ŵ))/(2*N)
+end
+nothing # hide
+```
+
+
+
+```@example 3
+∂l_∂αs = Float64[]
+N, D = size(X_train)
+
+for α in αs
+ model, w, _, ŵ = fitRidge(X_train, Y_train, α)
+
+ # testing optimality
+ ∂l_∂w = [2*α*ŵ[i] - sum(X_train[:,i].*(Y_train - X_train*ŵ))/N for i in 1:D]
+ @assert norm(∂l_∂w) < 1e-1
+
+ push!(
+ ∂l_∂αs,
+ ∇model(model, X_train, w, ŵ, α)
+ )
+end
+
+plot(αs, ∂l_∂αs, label="∂l/∂α", xaxis = ("α"))
+```
+
+
+
+
+# Gradient Descent
+
+```@example 3
+"""
+ start from initial value of regularization constant
+ do gradient descent on alpha
+ until the MSE keeps on decreasing
+"""
+function descent(α, max_iters=25)
+ prev_mse = 1e7
+ curr_mse = 1e6
+
+ α_s = Float64[]
+ mse = Float64[]
+
+ iter=0
+ while curr_mse-10 < prev_mse && iter < max_iters
+ iter += 1
+ model, w, _, ŵ = fitRidge(X_train, Y_train, α)
+
+ #update
+ ∂α = ∇model(model, X_train, w, ŵ, α)
+
+ α += 0.01*∂α # make it grow real slow
+
+ push!(α_s, α)
+
+ Y_pred = X_test*ŵ
+
+ prev_mse = curr_mse
+ curr_mse = sum((Y_pred - Y_test).^2)
+
+ push!(mse, curr_mse)
+ end
+
+ return α_s, mse
+end
+nothing # hide
+```
+
+
+```@example 3
+ᾱ, msē = descent(1.0);
+nothing # hide
+```
+
+
+```@example 3
+plot(log.(αs), mse, label="MSE", xaxis = ("α"))
+plot!(log.(ᾱ), msē, label="G.D. for α", lw = 2)
+```
diff --git a/docs/src/custom-relu.md b/docs/src/custom-relu.md
new file mode 100644
index 000000000..5749247fe
--- /dev/null
+++ b/docs/src/custom-relu.md
@@ -0,0 +1,136 @@
+# Custom ReLU layer
+
+We demonstrate how DiffOpt can be used to generate a simple neural network unit - the ReLU layer. A neural network is created using Flux.jl and trained on the MNIST dataset.
+
+
+```@example 1
+using Statistics
+using DiffOpt
+using Flux
+using Flux: onehotbatch, onecold, crossentropy, throttle
+using Base.Iterators: repeated
+using OSQP
+using JuMP
+using ChainRulesCore
+```
+
+
+```@example 1
+## prepare data
+imgs = Flux.Data.MNIST.images()
+labels = Flux.Data.MNIST.labels();
+
+# Preprocessing
+X = hcat(float.(reshape.(imgs, :))...) #stack all the images
+Y = onehotbatch(labels, 0:9); # just a common way to encode categorical variables
+
+test_X = hcat(float.(reshape.(Flux.Data.MNIST.images(:test), :))...)
+test_Y = onehotbatch(Flux.Data.MNIST.labels(:test), 0:9)
+
+# float64 to float16, to save memory
+X = convert(Array{Float16,2}, X)
+test_X = convert(Array{Float16,2}, test_X)
+
+X = X[:, 1:1000]
+Y = Y[:, 1:1000]
+nothing # hide
+```
+
+
+```@example 1
+"""
+relu method for a Matrix
+"""
+function matrix_relu(y::AbstractMatrix{T}; model = Model(() -> diff_optimizer(OSQP.Optimizer))) where {T}
+ _x = zeros(T, size(y))
+
+ # model init
+ N = length(y[:, 1])
+ empty!(model)
+ set_optimizer_attribute(model, MOI.Silent(), true)
+ @variable(model, x[1:N] >= zero(T))
+
+ for i in 1:size(y)[2]
+ @objective(
+ model,
+ Min,
+ x'x -2x'y[:, i]
+ )
+ optimize!(model)
+ _x[:, i] = value.(x)
+ end
+ return _x
+end
+nothing # hide
+```
+
+
+
+
+```@example 1
+function ChainRulesCore.rrule(::typeof(matrix_relu), y::AbstractArray{T}; model = Model(() -> diff_optimizer(OSQP.Optimizer))) where {T}
+
+ pv = matrix_relu(y, model=model)
+
+ function pullback_matrix_relu(dx)
+ x = model[:x]
+ dy = zeros(T, size(dx))
+
+ for i in 1:size(y)[2]
+ MOI.set.(
+ model,
+ DiffOpt.BackwardInVariablePrimal(),
+ x,
+ dx[:, i]
+ )
+
+ DiffOpt.backward(model) # find grad
+
+ # fetch the objective expression
+ obj_exp = MOI.get(model, DiffOpt.BackwardOutObjective())
+
+ dy[:, i] = JuMP.coefficient.(obj_exp, x) # coeff of `x` in -2x'y
+ dy[:, i] = -2 * dy[:, i]
+ end
+
+ return (NO_FIELDS, dy)
+ end
+ return pv, pullback_matrix_relu
+end
+```
+
+## Define the Network
+
+
+```@example 1
+m = Chain(
+ Dense(784, 64),
+ matrix_relu,
+ Dense(64, 10),
+ softmax,
+)
+nothing # hide
+```
+
+
+```@example 1
+custom_loss(x, y) = crossentropy(m(x), y)
+opt = ADAM(); # popular stochastic gradient descent variant
+
+accuracy(x, y) = mean(onecold(m(x)) .== onecold(y)) # cute way to find average of correct guesses
+
+dataset = repeated((X,Y), 20) # repeat the data set, very low accuracy on the orig dataset
+evalcb = () -> @show(custom_loss(X, Y)) # callback to show loss
+nothing # hide
+```
+
+
+Although our custom implementation takes time, it is able to reach accuracy similar to that of the usual ReLU function implementation.
+
+```@example 1
+Flux.train!(custom_loss, params(m), dataset, opt, cb = throttle(evalcb, 5));
+
+@show accuracy(X,Y)
+@show accuracy(test_X, test_Y)
+nothing # hide
+```
diff --git a/docs/src/custom-svm.md b/docs/src/custom-svm.md
new file mode 100644
index 000000000..8b693a6e4
--- /dev/null
+++ b/docs/src/custom-svm.md
@@ -0,0 +1,147 @@
+# Custom SVM layer
+
+This example demonstrates implementing an SVM using DiffOpt. This SVM is used as a layer in a neural network trained to predict survivors on the [Kaggle Titanic Dataset](https://www.kaggle.com/c/titanic).
+
+```@example 1
+using Statistics
+using DiffOpt
+using Flux
+using Flux: onecold, binarycrossentropy, throttle, logitcrossentropy, crossentropy
+using Base.Iterators: repeated
+using JuMP
+using SCS
+using CSV
+using DataFrames
+using ChainRulesCore
+using HTTP
+
+const DATA_URL = "https://raw.githubusercontent.com/be-apt/jump-gsoc-2020/master/titanic_preprocessed.csv"
+nothing # hide
+```
+
+
+```@example 1
+labels = NaN; # hack for the SVM
+nothing # hide
+```
+
+```@example 1
+"""
+ SVM as a Flux layer
+"""
+function SVM(X::AbstractMatrix{T}; model = Model(() -> diff_optimizer(SCS.Optimizer))) where {T}
+ D, N = size(X)
+
+ Y = vec([y >= 0.5 ? 1 : -1 for y in labels]')
+ # scale class from 0,1 to -1,1. required for SVM
+
+ # model init
+ empty!(model)
+ set_optimizer_attribute(model, MOI.Silent(), true)
+
+ # add variables
+ @variable(model, l[1:N])
+ @variable(model, w[1:D])
+ @variable(model, b)
+
+ @constraint(model, cons, Y.*(X'*w .+ b) + l.-1 ∈ MOI.Nonnegatives(N))
+ @constraint(model, 1.0*l ∈ MOI.Nonnegatives(N));
+
+ @objective(
+ model,
+ Min,
+ sum(l),
+ )
+
+ optimize!(model)
+
+ wv = value.(w)
+ bv = value(b)
+
+ return (X'*wv .+ bv)' #prediction
+end
+nothing # hide
+```
+
+
+
+```@example 1
+function ChainRulesCore.rrule(::typeof(SVM), X::AbstractArray{T}; model = Model(() -> diff_optimizer(SCS.Optimizer))) where {T}
+
+ predictions = SVM(X, model=model)
+
+ """
+ model[:w], model[:b] are the weights of this layer
+ they are not updated using backward pass
+ since they can be computed to an accurate degree using a solver
+ """
+ function pullback_SVM(dX)
+ dy = zeros(T, size(dX)) # since w#
+ return (NO_FIELDS, dy)
+ end
+ return predictions, pullback_SVM
+end
+```
+
+
+```@example 1
+function fetchProblem(;split_ratio::Float64)
+ df = CSV.File(HTTP.get(DATA_URL).body) |> DataFrame
+
+ Y = df[:, 2]
+ X = df[!, 3:12]
+ X = Matrix(X)'
+
+ D, N = size(X)
+
+ l = Int(floor(length(Y)*split_ratio))
+ return X[:, 1:l], X[:, l+1:N], Y[1:l]', Y[l+1:N]'
+end
+X_train, X_test, Y_train, Y_test = fetchProblem(split_ratio=0.8)
+D = size(X_train)[1]
+nothing # hide
+```
+
+## Define the NN
+
+
+```@example 1
+m = Chain(
+ Dense(D, 16, relu),
+ Dropout(0.5),
+ SVM
+)
+nothing # hide
+```
+
+
+```@example 1
+custom_loss(x, y) = logitcrossentropy(m(x), y)
+opt = ADAM(); # popular stochastic gradient descent variant
+
+classify(x::Float64) = (x>=0.5) ? 1 : 0
+
+function accuracy(x, y_true)
+ y_pred = classify.(m(x))
+ return sum(y_true .≈ y_pred) / length(y_true)
+end
+
+dataset = repeated((X_train,Y_train), 1) # repeat the data set, very low accuracy on the orig dataset
+evalcb = () -> @show(custom_loss(X_train,Y_train)) # callback to show loss
+nothing # hide
+```
+
+
+
+```@example 1
+labels = Y_train # needed for SVM
+for iter in 1:1
+ Flux.train!(custom_loss, params(m), dataset, opt, cb = throttle(evalcb, 5)); #took me ~5 minutes to train on CPU
+end
+
+@show accuracy(X_train, Y_train)
+
+labels = Y_test # needed for SVM
+@show accuracy(X_test, Y_test)
+nothing # hide
+```
diff --git a/docs/src/matrix-inversion-manual.md b/docs/src/matrix-inversion-manual.md
index 3e86a1d3e..33f220950 100644
--- a/docs/src/matrix-inversion-manual.md
+++ b/docs/src/matrix-inversion-manual.md
@@ -120,7 +120,7 @@ gradh = tape.gradient(summed_solution, [h_tf])
## Finding Jacobian using MOI, Dualization.jl, LinearAlgebra.jl
-```julia
+```@example 1
using Random
using MathOptInterface
using Dualization
@@ -193,11 +193,5 @@ end
LHS = [4 1 1; 1 2 1; 1 1 0] # of Eqn (6)
RHS = [0; 0; 1] # of Eqn (6)
-pp \ qq # the jacobian
+LHS \ RHS # the required jacobian
```
-
-
- 3-element Array{Float64,1}:
- 0.25
- 0.75
- -1.75
diff --git a/docs/src/sensitivity-analysis-ridge.md b/docs/src/sensitivity-analysis-ridge.md
new file mode 100644
index 000000000..c3f68612f
--- /dev/null
+++ b/docs/src/sensitivity-analysis-ridge.md
@@ -0,0 +1,122 @@
+# Sensitivity Analysis of Ridge Regression using DiffOpt.jl
+
+This example illustrates the sensitivity analysis of data points in a [Ridge Regression](https://en.wikipedia.org/wiki/Ridge_regression) problem. The general form of the problem is given below:
+
+```math
+\begin{split}
+\begin{array} {ll}
+\mbox{minimize} & \sum_{i=1}^{N} (y_{i} - w x_{i} - b)^2 + \alpha (w^2 + b^2) \\
+\end{array}
+\end{split}
+```
+where
+- `w`, `b` are slope and intercept of the regressing line
+- `x`, `y` are the N data points
+- `α` is the regularization constant
+
+## Define and solve the problem
+```@example 2
+import Random
+import OSQP
+import Plots
+using DiffOpt
+using JuMP
+using LinearAlgebra
+```
+
+Construct a set of noisy (Gaussian) data points around a line.
+```@example 2
+function create_problem(N=100)
+ m = 2*abs(randn())
+ b = rand()
+ X = randn(N)
+ Y = m*X .+ b + 0.8*randn(N)
+
+ return X, Y
+end
+
+X, Y = create_problem();
+nothing # hide
+```
+
+The helper method `fitRidge` defines and solves the corresponding model.
+```@example 2
+function fitRidge(X,Y,alpha=0.1)
+ model = Model(() -> diff_optimizer(OSQP.Optimizer))
+
+ # add variables
+ @variable(model, w)
+ @variable(model, b)
+ set_optimizer_attribute(model, MOI.Silent(), true)
+
+ @objective(
+ model,
+ Min,
+ sum((Y - w*X .- b).*(Y - w*X .- b)) + alpha*(sum(w*w)+sum(b*b)),
+ )
+
+ optimize!(model)
+
+ loss = objective_value(model)
+ return model, w, b, loss, value(w), value(b)
+end
+nothing # hide
+```
+
+Train on the data generated.
+```@example 2
+model, w, b, loss_train, ŵ, b̂ = fitRidge(X, Y)
+nothing # hide
+```
+
+We can visualize the approximating line.
+```@example 2
+p = Plots.scatter(X, Y, label="")
+mi, ma = minimum(X), maximum(X)
+Plots.plot!(p, [mi, ma], [mi*ŵ+b̂, ma*ŵ+b̂], color=:red, label="")
+nothing # hide
+```
+
+
+## Differentiate
+Now that we've solved the problem, we can compute the sensitivity of optimal values -- the approximating line components `w`, `b` in this case -- with respect to perturbations in the data points (`x`,`y`).
+```@example 2
+∇ = zero(X)
+
+for i in 1:length(X)
+ MOI.set(
+ model,
+ DiffOpt.ForwardInObjective(),
+ MOI.ScalarQuadraticFunction(
+ [MOI.ScalarAffineTerm(-2(Y[1] + X[1]), w.index)],
+ [MOI.ScalarQuadraticTerm(2X[1], w.index, w.index)],
+ 0.0
+ )
+ )
+
+ DiffOpt.forward(model)
+
+ db = MOI.get(
+ model,
+ DiffOpt.ForwardOutVariablePrimal(),
+ b
+ )
+
+ ∇[i] = db
+end
+normalize!(∇);
+nothing # hide
+```
+
+Visualize point sensitivities with respect to the regressing line. Note that the gradients are normalized.
+```@example 2
+p = Plots.scatter(
+ X, Y,
+ color=[x>0 ? :red : :blue for x in ∇],
+ markersize=[25*abs(x) for x in ∇],
+ label=""
+)
+mi, ma = minimum(X), maximum(X)
+Plots.plot!(p, [mi, ma], [mi*ŵ+b̂, ma*ŵ+b̂], color=:red, label="")
+nothing # hide
+```
diff --git a/docs/src/sensitivity-analysis-svm-img-1.svg b/docs/src/sensitivity-analysis-svm-img-1.svg
deleted file mode 100644
index b21f60722..000000000
--- a/docs/src/sensitivity-analysis-svm-img-1.svg
+++ /dev/null
@@ -1,214 +0,0 @@
-
-
diff --git a/docs/src/sensitivity-analysis-svm-img-2.svg b/docs/src/sensitivity-analysis-svm-img-2.svg
deleted file mode 100644
index d20435b76..000000000
--- a/docs/src/sensitivity-analysis-svm-img-2.svg
+++ /dev/null
@@ -1,214 +0,0 @@
-
-
diff --git a/docs/src/sensitivity-analysis-svm-img-3.svg b/docs/src/sensitivity-analysis-svm-img-3.svg
deleted file mode 100644
index 92efcf853..000000000
--- a/docs/src/sensitivity-analysis-svm-img-3.svg
+++ /dev/null
@@ -1,214 +0,0 @@
-
-
diff --git a/docs/src/sensitivity-analysis-svm.md b/docs/src/sensitivity-analysis-svm.md
index 20a4d489c..500408ba6 100644
--- a/docs/src/sensitivity-analysis-svm.md
+++ b/docs/src/sensitivity-analysis-svm.md
@@ -23,7 +23,6 @@ Import the libraries.
```@example 1
import Random
-using Test
import SCS
import Plots
using DiffOpt
@@ -37,9 +36,9 @@ Construct separatable, non-trivial data points.
```@example 1
N = 100
D = 2
-Random.seed!(6)
-X = vcat(randn(N, D), randn(N,D) .+ [4.0,1.5]')
-y = append!(ones(N), -ones(N))
+Random.seed!(62)
+X = vcat(randn(N÷2, D), randn(N÷2,D) .+ [4.0,1.5]')
+y = append!(ones(N÷2), -ones(N÷2))
nothing # hide
```
@@ -111,11 +110,8 @@ svm_y = (-bv .- wv[1] * svm_x )/wv[2]
p = Plots.scatter(X[:,1], X[:,2], color = [yi > 0 ? :red : :blue for yi in y], label = "")
Plots.yaxis!(p, (-2, 4.5))
Plots.plot!(p, svm_x, svm_y, label = "loss = $(round(loss, digits=2))", width=3)
-Plots.savefig("svm_separating.svg")
-nothing # hide
```
-
# Experiments
Now that we've solved the SVM, we can compute the sensitivity of optimal values -- the separating hyperplane in our case -- with respect to perturbations of the problem data -- the data points -- using DiffOpt. For illustration, we've explored two questions:
@@ -206,17 +202,13 @@ p2 = Plots.scatter(
)
Plots.yaxis!(p2, (-2, 4.5))
Plots.plot!(p2, svm_x, svm_y, label = "loss = $(round(loss, digits=2))", width=3)
-Plots.savefig("sensitivity2.svg")
-nothing # hide
```
-
-
## Experiment 2: Gradient of hyperplane wrt the data point coordinates
Similar to previous example, construct perturbations in data points coordinates `X`.
-```julia
+```@example 1
∇ = Float64[]
dX = zeros(N, D)
@@ -229,7 +221,7 @@ for Xi in 1:N
model,
DiffOpt.ForwardInConstraint(),
cons,
- MOI.Utilities.vectorize(dX[:,i] .* w[i]),
+ MOI.Utilities.vectorize(dX[:,i] .* MOI.SingleVariable(w[i])),
)
end
@@ -261,8 +253,4 @@ p3 = Plots.scatter(
)
Plots.yaxis!(p3, (-2, 4.5))
Plots.plot!(p3, svm_x, svm_y, label = "loss = $(round(loss, digits=2))", width=3)
-Plots.savefig(p3, "sensitivity3.svg")
-nothing # hide
```
-
-
diff --git a/docs/src/solve-LP.md b/docs/src/solve-LP.md
deleted file mode 100644
index 6f5a44db1..000000000
--- a/docs/src/solve-LP.md
+++ /dev/null
@@ -1,158 +0,0 @@
-# Solving LP primal
-
-
-```julia
-using Random
-using GLPK
-using MathOptInterface
-using Dualization
-
-const MOI = MathOptInterface
-const MOIU = MathOptInterface.Utilities;
-```
-
-
-```julia
-D = 10 # variable dimension
-N = 20; # no of inequality constraints
-```
-
-## create a non-trivial LP problem
-$$\text{min } c^Tx$$
-$$\text{s.t. } Ax \leq b$$
- $$x \geq 0, x \in R^D$$
-
-
-```julia
-s = rand(N)
-s = 2*s.-1
-λ = max.(-s, 0)
-s = max.(s, 0)
-x̂ = rand(D)
-A = rand(N, D)
-b = A*x̂ .+ s
-c = -A'*λ;
-```
-
-
-```julia
-# can feed dual problem to optimizer like this:
-# model = MOI.instantiate(dual_optimizer(GLPK.Optimizer), with_bridge_type=Float64)
-
-model = GLPK.Optimizer()
-x = MOI.add_variables(model, D)
-
-# define objective
-objective_function = MOI.ScalarAffineFunction(MOI.ScalarAffineTerm.(c, x), 0.0)
-MOI.set(model, MOI.ObjectiveFunction{MOI.ScalarAffineFunction{Float64}}(), objective_function)
-MOI.set(model, MOI.ObjectiveSense(), MOI.MIN_SENSE)
-```
-
-
-```julia
-# will be useful later
-constraint_indices = []
-
-# set constraints
-for i in 1:N
- push!(constraint_indices, MOI.add_constraint(model,MOI.ScalarAffineFunction(MOI.ScalarAffineTerm.(A[i,:], x), 0.),MOI.LessThan(b[i])))
-end
-
-for i in 1:D
- push!(constraint_indices, MOI.add_constraint(model,MOI.SingleVariable(x[i]),MOI.GreaterThan(0.)))
-end
-```
-
-
-```julia
-MOI.optimize!(model)
-```
-
-
-```julia
-@assert MOI.get(model, MOI.TerminationStatus()) in [MOI.LOCALLY_SOLVED, MOI.OPTIMAL]
-```
-
-
-```julia
-x̄ = MOI.get(model, MOI.VariablePrimal(), x); # solution
-```
-
-
-```julia
-@assert abs(c'x̄ - c'x̂) <= 1e-8 # sanity check
-```
-
-## find and solve dual problem
-
-| primal | dual |
-|--------|------|
-$$\text{min } c^Tx$$ | $$\text{max } b^Ty$$
-$$\text{s.t. } Ax \leq b$$ | $$\text{s.t. } A^Ty \geq c$$
- $$x \geq 0$$ | $$y \leq 0$$
-
-- Each primal variable becomes a dual constraint
-- Each primal constraint becomes a dual variable
-
-
-```julia
-joint_object = dualize(model)
-dual_model_like = joint_object.dual_model # this is MOI.ModelLike, not an MOI.AbstractOptimizer; can't call optimizer on it
-primal_dual_map = joint_object.primal_dual_map;
-```
-
-
-```julia
-# copy the dual model objective, constraints, and variables to an optimizer
-dual_model = GLPK.Optimizer()
-MOI.copy_to(dual_model, dual_model_like)
-
-# solve dual
-MOI.optimize!(dual_model);
-```
-
-
-```julia
-# NOTE: You can obtain components of the dual model individually by -
-# dual_objective = dual_model_like.objective # b'y
-# dual_variable_indices = [primal_dual_map.primal_con_dual_var[x][1] for x in constraint_indices]
-# dual_constraint_indices = [primal_dual_map.primal_var_dual_con[i] for i in x];
-
-# ŷ = MOI.get(dm, MOI.VariablePrimal(), dual_variable_indices)
-```
-
-
-```julia
-# check if strong duality holds
-@assert abs(MOI.get(model, MOI.ObjectiveValue()) - MOI.get(dual_model, MOI.ObjectiveValue())) <= 1e-8
-```
-
-## derive and verify KKT conditions
-
-**complimentary slackness**: $$\mu_{i}(A\bar x -b)_i=0,\quad \mu_{j+N} \bar x_j =0 \qquad \text{ where } i=1..N, j = 1..D$$
-
-
-```julia
-is_less_than(set::S) where {S<:MOI.AbstractSet} = false
-is_less_than(set::MOI.LessThan{T}) where T = true
-
-map = primal_dual_map.primal_con_dual_var
-for con_index in keys(map)
- con_value = MOI.get(model, MOI.ConstraintPrimal(), con_index)
- set = MOI.get(model, MOI.ConstraintSet(), con_index)
- μ = MOI.get(dual_model, MOI.VariablePrimal(), map[con_index][1])
-
- if is_less_than(set)
- # μ[i]*(Ax - b)[i] = 0
- @assert μ*(con_value - set.upper) < 1e-10
- else
- # μ[j]*x[j] = 0
- @assert μ*(con_value - set.lower) < 1e-10
- end
-end
-```
-
-
-```julia
-
-```
diff --git a/docs/src/solve-QP.md b/docs/src/solve-QP.md
deleted file mode 100644
index 7ad7a8808..000000000
--- a/docs/src/solve-QP.md
+++ /dev/null
@@ -1,196 +0,0 @@
-# Solving QP primal
-
-
-```julia
-using Random
-using MathOptInterface
-using Dualization
-using OSQP
-
-const MOI = MathOptInterface
-const MOIU = MathOptInterface.Utilities;
-```
-
-
-```julia
-n = 20 # variable dimension
-m = 15 # no of inequality constraints
-p = 15; # no of equality constraints
-```
-
-## create a non-trivial QP problem
-$$\text{min } \frac{1}{2}x^TQx + q^Tx$$
-$$\text{s.t. }Gx <= h$$
- $$Ax = b$$
-
-
-```julia
-x̂ = rand(n)
-Q = rand(n, n)
-Q = Q'*Q # ensure PSD
-q = rand(n)
-G = rand(m, n)
-h = G*x̂ + rand(m)
-A = rand(p, n)
-b = A*x̂;
-```
-
-
-```julia
-model = MOI.instantiate(OSQP.Optimizer, with_bridge_type=Float64)
-x = MOI.add_variables(model, n);
-```
-
-
-```julia
-# define objective
-
-quad_terms = MOI.ScalarQuadraticTerm{Float64}[]
-for i in 1:n
- for j in i:n # indexes (i,j), (j,i) will be mirrored. specify only one kind
- push!(quad_terms, MOI.ScalarQuadraticTerm(Q[i,j],x[i],x[j]))
- end
-end
-
-objective_function = MOI.ScalarQuadraticFunction(MOI.ScalarAffineTerm.(q, x),quad_terms,0.0)
-MOI.set(model, MOI.ObjectiveFunction{MOI.ScalarQuadraticFunction{Float64}}(), objective_function)
-MOI.set(model, MOI.ObjectiveSense(), MOI.MIN_SENSE)
-```
-
-
-```julia
-# maintain constrain to index map - will be useful later
-constraint_map = Dict()
-
-# add constraints
-for i in 1:m
- ci = MOI.add_constraint(model,MOI.ScalarAffineFunction(MOI.ScalarAffineTerm.(G[i,:], x), 0.),MOI.LessThan(h[i]))
- constraint_map[ci] = i
-end
-
-for i in 1:p
- ci = MOI.add_constraint(model,MOI.ScalarAffineFunction(MOI.ScalarAffineTerm.(A[i,:], x), 0.),MOI.EqualTo(b[i]))
- constraint_map[ci] = i
-end
-```
-
-
-```julia
-MOI.optimize!(model)
-```
-
-
-```julia
-@assert MOI.get(model, MOI.TerminationStatus()) in [MOI.LOCALLY_SOLVED, MOI.OPTIMAL]
-```
-
-
-```julia
-x̄ = MOI.get(model, MOI.VariablePrimal(), x);
-```
-
-
-```julia
-# objective value (predicted vs actual) sanity check
-@assert 0.5*x̄'*Q*x̄ + q'*x̄ <= 0.5*x̂'*Q*x̂ + q'*x̂
-```
-
-## find and solve dual problem
-
-| primal | dual |
-|--------|------|
-$$\text{min } \frac{1}{2}x^TQx + q^Tx$$ | $$\text{max } -\frac{1}{2}y^TQ^{-1}y - u^Th - v^Tb$$
-$$\text{s.t. }Gx <= h$$ | $$\text{s.t. } u \geq 0, u \in R^m, v \in R^n$$
- $$Ax = b$$ | $$y = q + G^Tu + A^Tv$$
-
-- Each primal variable becomes a dual constraint
-- Each primal constraint becomes a dual variable
-
-
-```julia
-# NOTE: can't use Ipopt
-# Ipopt.Optimizer doesn't supports accessing MOI.ObjectiveFunctionType
-
-joint_object = dualize(model)
-dual_model_like = joint_object.dual_model # this is MOI.ModelLike, not an MOI.AbstractOptimizer; can't call optimizer on it
-primal_dual_map = joint_object.primal_dual_map;
-```
-
-
-```julia
-# copy the dual model objective, constraints, and variables to an optimizer
-dual_model = MOI.instantiate(OSQP.Optimizer, with_bridge_type=Float64)
-MOI.copy_to(dual_model, dual_model_like)
-
-# solve dual
-MOI.optimize!(dual_model);
-```
-
-
-```julia
-# check if strong duality holds
-@assert abs(MOI.get(model, MOI.ObjectiveValue()) - MOI.get(dual_model, MOI.ObjectiveValue())) <= 1e-1
-```
-
-## derive and verify KKT conditions
-
-
-```julia
-is_equality(set::S) where {S<:MOI.AbstractSet} = false
-is_equality(set::MOI.EqualTo{T}) where T = true
-
-map = primal_dual_map.primal_con_dual_var;
-```
-
-**complimentary slackness**: $$\mu_{i}(G\bar x -h)_i=0 \qquad \text{ where } i=1..m$$
-
-
-```julia
-for con_index in keys(map)
- # NOTE: OSQP.Optimizer doesn't allows access to MOI.ConstraintPrimal
- # That's why I defined a custom map
-
- set = MOI.get(model, MOI.ConstraintSet(), con_index)
- μ = MOI.get(dual_model, MOI.VariablePrimal(), map[con_index][1])
-
- if !is_equality(set)
- # μ[i]*(Gx - h)[i] = 0
- i = constraint_map[con_index]
-
- # println(μ," - ",G[i,:]'*x̄, " - ",h[i])
- # TODO: assertion fails
- @assert μ*(G[i,:]'*x̄ - h[i]) < 1e-1
- end
-end
-```
-
-**primal feasibility**:
-$$(G\bar x -h)_i=0 \qquad \text{ where } i=1..m$$
-$$(A\bar x -b)_j=0 \qquad \text{ where } j=1..p$$
-
-**dual feasibility**:
-$$\mu_i \geq 0 \qquad \text{ where } i=1..m$$
-
-
-```julia
-for con_index in keys(map)
- # NOTE: OSQP.Optimizer doesn't allows access to MOI.ConstraintPrimal
- # That's why I defined a custom map
-
- set = MOI.get(model, MOI.ConstraintSet(), con_index)
- μ = MOI.get(dual_model, MOI.VariablePrimal(), map[con_index][1])
- i = constraint_map[con_index]
-
- if is_equality(set)
- # (Ax - h)[i] = 0
- @assert abs(A[i,:]'*x̄ - b[i]) < 1e-2
- else
- # (Gx - h)[i] = 0
- @assert G[i,:]'*x̄ - h[i] < 1e-2
-
- # μ[i] >= 0
- # TODO: assertion fails
- @assert μ > -1e-2
- end
-end
-```
diff --git a/docs/src/solve-conic-1.md b/docs/src/solve-conic-1.md
index 39c1b1612..59c6d9387 100644
--- a/docs/src/solve-conic-1.md
+++ b/docs/src/solve-conic-1.md
@@ -54,68 +54,9 @@ X \in \mathbb{S}^n: z^T X z \geq 0, \quad \forall z \in \mathbb{R}^n
> Refered from Mosek examples: https://docs.mosek.com/9.2/toolbox/tutorial-sdo-shared.html#example-sdo1
-## Equivalent DiffCP program to differentiate
-```python
-import numpy as np
-import cvxpy as cp
-from scipy import sparse
-import diffcp
-
-A = sparse.csc_matrix((11+1,7+1), dtype=np.float64)
-A[2 , 1] = 1.0
-A[3 , 1] = -1.0
-A[9 , 1] = -0.45
-A[10, 1] = 0.45
-A[11, 1] = -0.45
-A[2 , 2] = 1.0
-A[4 , 2] = -1.0
-A[9 , 2] = -0.8
-A[10, 2] = 0.318198
-A[11, 2] = -0.1
-A[2 , 3] = 1.0
-A[5 , 3] = -1.0
-A[9 , 3] = -0.9
-A[2 , 4] = 1.0
-A[6 , 4] = -1.0
-A[9 , 4] = -0.225
-A[2 , 5] = 1.0
-A[7 , 5] = -1.0
-A[9 , 5] = -0.1125
-A[10, 5] = 0.1125
-A[11, 5] = -0.1125
-A[2 , 6] = 1.0
-A[8 , 6] = -1.0
-A[11, 6] = -0.225
-A[9 , 7] = 1.0
-A[11, 7] = 1.0
-
-A = A[1:, 1:]
-
-# equivalent to: https://github.com/jump-dev/MathOptInterface.jl/blob/master/src/Test/contconic.jl#L2575
-
-cone_dict = {
- diffcp.POS: 7,
- diffcp.PSD: [2],
- diffcp.ZERO: 1
-}
-
-b = np.array([0.0, 10.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 0.0, 0.0, 0.0])
-c = np.array([-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -1.0])
-
-x, y, s, D, DT = diffcp.solve_and_derivative(A, b, c, cone_dict)
-print(x) # MOI.VariablePrimal
-print(s) # MOI.ConstraintPrimal
-print(y) # MOI.ConstraintDual
-
-
-dx, dy, ds = D(sparse.csc_matrix(np.ones((11,7))), np.ones(11), np.ones(7))
-print(dx)
-print(ds)
-print(dy)
-```
-
## Equivalent DiffOpt program
-```julia
+- Instantiate the program with constraints
+```@example 1
using SCS
using DiffOpt
using MathOptInterface
@@ -170,6 +111,11 @@ c2 = MOI.add_constraint(
),
MOI.Zeros(1)
)
+nothing # hide
+```
+
+- Add objective and solve
+```@example 1
objXidx = [1:3; 5:6]
objXcoefs = 2*ones(5)
@@ -185,20 +131,25 @@ x_sol = MOI.get(model, MOI.VariablePrimal(), vcat(X, x))
s_sol = MOI.get(model, MOI.ConstraintPrimal(), [cX, cx, c1, c2])
y_sol = MOI.get(model, MOI.ConstraintDual(), [cX, cx, c1, c2])
-println("x -> ", round.(x_sol; digits=3))
-println("s -> ", round.(s_sol; digits=3))
-println("y -> ", round.(y_sol; digits=3))
+@show x_sol
+@show s_sol
+@show y_sol
+nothing # hide
+```
+
+- Now differentiate with perturbation
+```@example 1
# perturbations in all the parameters
-fx = MOI.SingleVariable.(x)
+fx = MOI.SingleVariable.(vcat(X, x))
MOI.set(model,
DiffOpt.ForwardInConstraint(), c1, MOI.Utilities.vectorize(ones(1, 9) * fx + ones(1)))
MOI.set(model,
- DiffOpt.ForwardInConstraint(), c2, MOI.Utilities.vectorize(ones(6, 9) * fx + ones(6)))
+ DiffOpt.ForwardInConstraint(), cX, MOI.Utilities.vectorize(ones(6, 9) * fx + ones(6)))
MOI.set(model,
- DiffOpt.ForwardInConstraint(), c3, MOI.Utilities.vectorize(ones(3, 9) * fx + ones(3)))
+ DiffOpt.ForwardInConstraint(), cx, MOI.Utilities.vectorize(ones(3, 9) * fx + ones(3)))
MOI.set(model,
- DiffOpt.ForwardInConstraint(), c4, MOI.Utilities.vectorize(ones(1, 9) * fx + ones(1)))
+ DiffOpt.ForwardInConstraint(), c2, MOI.Utilities.vectorize(ones(1, 9) * fx + ones(1)))
# differentiate and get the gradients
DiffOpt.forward(model)
@@ -206,7 +157,6 @@ DiffOpt.forward(model)
dx = MOI.get.(model,
DiffOpt.ForwardOutVariablePrimal(), vcat(X, x))
-println("dx -> ", round.(dx; digits=3))
-# println("ds -> ", round.(ds; digits=3))
-# println("dy -> ", round.(dy; digits=3))
+@show dx
+nothing # hide
```
diff --git a/docs/src/usage.md b/docs/src/usage.md
index 91833656f..07e255698 100644
--- a/docs/src/usage.md
+++ b/docs/src/usage.md
@@ -2,18 +2,19 @@
Create a differentiable model from [existing optimizers](https://www.juliaopt.org/JuMP.jl/stable/installation/)
```julia
- using DiffOpt
- using SCS
-
- model = diff_optimizer(SCS.Optimizer)
+using DiffOpt
+using SCS
+using JuMP
+
+model = diff_optimizer(SCS.Optimizer)
```
Update and solve the model
```julia
- x = MOI.add_variables(model, 2)
- c = MOI.add_constraint(model, ...)
-
- MOI.optimize!(model)
+x = MOI.add_variables(model, 2)
+c = MOI.add_constraint(model, ...)
+
+MOI.optimize!(model)
```
Finally differentiate the model (primal and dual variables specifically) to obtain product of jacobians with respect to problem parameters and a backward pass vector. Currently `DiffOpt` supports two backends for differentiating a model:
diff --git a/examples/autotuning-ridge.jl b/examples/autotuning-ridge.jl
new file mode 100644
index 000000000..ec7112242
--- /dev/null
+++ b/examples/autotuning-ridge.jl
@@ -0,0 +1,177 @@
+"""
+ Source code for the example given in autotuning-ridge.md
+"""
+
+using DiffOpt
+using Statistics
+using OSQP
+using JuMP
+import Random
+using LinearAlgebra
+using Printf
+Base.show(io::IO, f::Float64) = @printf(io, "%1.3f", f) # reduce printed float precision (NOTE: type piracy — this overrides Float64 printing globally)
+
+# optional Plot to integrate in tests
+# set ENV["BUILD_PLOT"] = "1" to build plots
+const should_plot = get(ENV, "BUILD_PLOT", nothing) == "1"
+if should_plot
+ using Plots
+end
+
+"""
+    Return the coefficient of determination R2 of the prediction.
+    The best possible score is 1.0; it can be negative
+    (because the model can be arbitrarily worse).
+"""
+function R2(y_true, y_pred)
+ u = sum((y_pred - y_true).^2) # Regression sum of squares
+ v = sum((y_true .- mean(y_true)).^2) # Total sum of squares
+
+ return 1-(u/v)
+end
+
+function create_problem(N, D, noise)
+ w = rand(D)
+ X = rand(N, D)
+
+    # if noise=0, there is no need for regularization,
+    # and alpha=0 will give the best R2 prediction score
+ Y = X*w .+ noise*randn(N)
+
+ # test train split
+ l = Int(N*0.5)
+ return X[1:l, :], X[l+1:N, :], Y[1:l], Y[l+1:N]
+end
+
+X_train, X_test, Y_train, Y_test = create_problem(800, 30, 4);
+
+function fitRidge(X,Y,α)
+ model = Model(() -> diff_optimizer(OSQP.Optimizer))
+
+ N, D = size(X)
+
+ # add variables
+ @variable(model, w[1:D]>=-10)
+ set_optimizer_attribute(model, MOI.Silent(), true)
+
+ @objective(
+ model,
+ Min,
+ sum((Y - X*w).*(Y - X*w))/(2.0*N) + α*(w'w),
+ )
+
+ optimize!(model)
+
+ loss = objective_value(model)
+ return model, w, loss, value.(w)
+end
+
+αs = [0.0, 1e-3, 2e-3, 5e-3, 1e-2, 2e-2, 5e-2, 7e-2, 2e-1, 3e-1, .5, .7, 1.0]
+Rs = Float64[]
+mse = Float64[]
+
+for α in αs
+ _, _, _, w_train = fitRidge(X_train, Y_train, α)
+ Y_pred = X_test*w_train
+ push!(Rs, R2(Y_test, Y_pred))
+ push!(mse, sum((Y_pred - Y_test).^2))
+end
+
+if should_plot
+ plot(log.(αs), Rs*10, label="R2 prediction score", xaxis = ("log(α)"))
+ plot(log.(αs), mse, label="MSE", xaxis = ("log(α)"))
+end
+
+function ∇model(model, X_train, w, ŵ, α)
+ N, D = size(X_train)
+ dw = zeros(D)
+ ∂w_∂α = zeros(D)
+
+ for i in 1:D
+ dw[i] = 1.0 #set
+
+ MOI.set(
+ model,
+ DiffOpt.ForwardInObjective(),
+ MOI.ScalarQuadraticFunction(
+ [MOI.ScalarAffineTerm(0.0, w[i].index)],
+ [MOI.ScalarQuadraticTerm(dw[i]*α, w[i].index, w[i].index)],
+ 0.0
+ )
+ )
+
+ DiffOpt.forward(model) # find grad
+
+ ∂w_∂α[i] = MOI.get(
+ model,
+ DiffOpt.ForwardOutVariablePrimal(),
+ w[i]
+ )
+
+ dw[i] = 0.0 #unset
+ end
+ return sqrt(ŵ'ŵ) + 2α*(ŵ'∂w_∂α) - sum((X_train*∂w_∂α).*(Y_train - X_train*ŵ))/(2*N)
+end
+
+∂l_∂αs = Float64[]
+N, D = size(X_train)
+
+for α in αs
+ model, w, _, ŵ = fitRidge(X_train, Y_train, α)
+
+ # testing optimality
+ ∂l_∂w = [2*α*ŵ[i] - sum(X_train[:,i].*(Y_train - X_train*ŵ))/N for i in 1:D]
+ @assert norm(∂l_∂w) < 1e-1
+
+ push!(
+ ∂l_∂αs,
+ ∇model(model, X_train, w, ŵ, α)
+ )
+end
+
+if should_plot
+ plot(αs, ∂l_∂αs, label="∂l/∂α", xaxis = ("α"))
+end
+
+"""
+    Start from an initial value of the regularization constant
+    and do gradient descent on alpha
+    while the MSE keeps decreasing (up to `max_iters` iterations).
+"""
+function descent(α, max_iters=25)
+ prev_mse = 1e7
+ curr_mse = 1e6
+
+ α_s = Float64[]
+ mse = Float64[]
+
+ iter=0
+ κ = 0.01
+ while curr_mse-10 < prev_mse && iter < max_iters
+ iter += 1
+ model, w, _, ŵ = fitRidge(X_train, Y_train, α)
+
+ #update
+ ∂α = ∇model(model, X_train, w, ŵ, α)
+
+ α += κ*∂α # update step
+
+ push!(α_s, α)
+
+ Y_pred = X_test*ŵ
+
+ prev_mse = curr_mse
+ curr_mse = sum((Y_pred - Y_test).^2)
+
+ push!(mse, curr_mse)
+ end
+
+ return α_s, mse
+end
+
+ᾱ, msē = descent(1.0);
+
+if should_plot
+ plot(log.(αs), mse, label="MSE", xaxis = ("α"))
+ plot!(log.(ᾱ), msē, label="G.D. for α", lw = 2)
+end
diff --git a/examples/chainrules.jl b/examples/chainrules.jl
index 5ac83747b..3c56cf4d8 100644
--- a/examples/chainrules.jl
+++ b/examples/chainrules.jl
@@ -3,6 +3,7 @@ using Clp
using DiffOpt
using Test
using ChainRulesCore
+using LinearAlgebra
# This script creates the JuMP model for a small unit commitment instance
# represented in a solution map function taking parameters as arguments and returning
diff --git a/examples/custom-relu.jl b/examples/custom-relu.jl
new file mode 100644
index 000000000..1421986c5
--- /dev/null
+++ b/examples/custom-relu.jl
@@ -0,0 +1,106 @@
+"""
+ Source code for the example given in custom-relu.md
+"""
+
+using Statistics
+using DiffOpt
+using Flux
+using Flux: onehotbatch, onecold, crossentropy, throttle
+using Base.Iterators: repeated
+using OSQP
+using JuMP
+using ChainRulesCore
+
+## prepare data
+imgs = Flux.Data.MNIST.images()
+labels = Flux.Data.MNIST.labels();
+
+# Preprocessing
+X = hcat(float.(reshape.(imgs, :))...) #stack all the images
+Y = onehotbatch(labels, 0:9); # just a common way to encode categorical variables
+
+test_X = hcat(float.(reshape.(Flux.Data.MNIST.images(:test), :))...)
+test_Y = onehotbatch(Flux.Data.MNIST.labels(:test), 0:9)
+
+# float64 to float16, to save memory
+X = convert(Array{Float16,2}, X)
+test_X = convert(Array{Float16,2}, test_X)
+
+X = X[:, 1:1000]
+Y = Y[:, 1:1000];
+
+"""
+    ReLU of a matrix, computed column-wise by solving a small projection QP per column
+"""
+function matrix_relu(y::AbstractMatrix{T}; model = Model(() -> diff_optimizer(OSQP.Optimizer))) where {T}
+ x̂ = zeros(T, size(y))
+
+ # model init
+ N = length(y[:, 1])
+ empty!(model)
+ set_optimizer_attribute(model, MOI.Silent(), true)
+ @variable(model, x[1:N] >= zero(T))
+
+ for i in 1:size(y)[2]
+ @objective(
+ model,
+ Min,
+ x'x -2x'y[:, i]
+ )
+ optimize!(model)
+ x̂[:, i] = value.(x)
+ end
+ return x̂
+end
+
+function ChainRulesCore.rrule(::typeof(matrix_relu), y::AbstractArray{T}; model = Model(() -> diff_optimizer(OSQP.Optimizer))) where {T}
+
+ pv = matrix_relu(y, model=model)
+
+ function pullback_matrix_relu(dx)
+ x = model[:x]
+ dy = zeros(T, size(dx))
+
+ for i in 1:size(y)[2]
+ MOI.set.(
+ model,
+ DiffOpt.BackwardInVariablePrimal(),
+ x,
+ dx[:, i]
+ )
+
+ DiffOpt.backward(model) # find grad
+
+ # fetch the objective expression
+ obj_exp = MOI.get(model, DiffOpt.BackwardOutObjective())
+
+ dy[:, i] = JuMP.coefficient.(obj_exp, x) # coeff of `x` in -2x'y
+ dy[:, i] = -2 * dy[:, i]
+ end
+
+ return (NO_FIELDS, dy)
+ end
+ return pv, pullback_matrix_relu
+end
+
+m = Chain(
+ Dense(784, 64),
+ matrix_relu,
+ Dense(64, 10),
+ softmax
+)
+
+custom_loss(x, y) = crossentropy(m(x), y)
+opt = ADAM(); # popular stochastic gradient descent variant
+
+accuracy(x, y) = mean(onecold(m(x)) .== onecold(y)) # cute way to find average of correct guesses
+
+dataset = repeated((X,Y), 20) # repeat the data set, very low accuracy on the orig dataset
+evalcb = () -> @show(custom_loss(X, Y)) # callback to show loss
+
+Flux.train!(custom_loss, params(m), dataset, opt, cb = throttle(evalcb, 5)); #took me ~5 minutes to train on CPU
+
+@show accuracy(X,Y)
+@show accuracy(test_X, test_Y);
+
+
diff --git a/examples/custom-svm.jl b/examples/custom-svm.jl
new file mode 100644
index 000000000..ba70748fa
--- /dev/null
+++ b/examples/custom-svm.jl
@@ -0,0 +1,114 @@
+"""
+ Source code for the example given in custom-svm.md
+"""
+
+using Statistics
+using DiffOpt
+using Flux
+using Flux: onecold, binarycrossentropy, throttle, logitcrossentropy, crossentropy
+using Base.Iterators: repeated
+using JuMP
+using SCS
+using CSV
+using DataFrames
+using ChainRulesCore
+using HTTP
+
+const DATA_URL = "https://raw.githubusercontent.com/be-apt/jump-gsoc-2020/master/titanic_preprocessed.csv"
+labels = NaN; # hack for the SVM
+
+"""
+ SVM as a Flux layer
+"""
+function SVM(X::AbstractMatrix{T}; model = Model(() -> diff_optimizer(SCS.Optimizer))) where {T}
+ D, N = size(X)
+
+ Y = vec([y >= 0.5 ? 1 : -1 for y in labels]')
+ # scale class from 0,1 to -1,1. required for SVM
+
+ # model init
+ empty!(model)
+ set_optimizer_attribute(model, MOI.Silent(), true)
+
+ # add variables
+ @variable(model, l[1:N])
+ @variable(model, w[1:D])
+ @variable(model, b)
+
+ @constraint(model, cons, Y.*(X'*w .+ b) + l.-1 ∈ MOI.Nonnegatives(N))
+ @constraint(model, 1.0*l ∈ MOI.Nonnegatives(N));
+
+ @objective(
+ model,
+ Min,
+ sum(l),
+ )
+
+ optimize!(model)
+
+ wv = value.(w)
+ bv = value(b)
+
+ return (X'*wv .+ bv)' #prediction
+end
+
+function ChainRulesCore.rrule(::typeof(SVM), X::AbstractArray{T}; model = Model(() -> diff_optimizer(SCS.Optimizer))) where {T}
+
+ predictions = SVM(X, model=model)
+
+    """
+    model[:w], model[:b] are the weights of this layer;
+    they are not updated by the backward pass,
+    since they can be computed to an accurate degree using a solver.
+    """
+ function pullback_SVM(dX)
+        dy = zeros(T, size(dX)) # zero gradient: the SVM weights are fit by the solver, not by backprop
+ return (NO_FIELDS, dy)
+ end
+ return predictions, pullback_SVM
+end
+
+function fetchProblem(;split_ratio::Float64)
+ df = CSV.File(HTTP.get(DATA_URL).body) |> DataFrame
+
+ Y = df[:, 2]
+ X = df[!, 3:12]
+ X = Matrix(X)'
+
+ D, N = size(X)
+
+ l = Int(floor(length(Y)*split_ratio))
+ return X[:, 1:l], X[:, l+1:N], Y[1:l]', Y[l+1:N]'
+end
+X_train, X_test, Y_train, Y_test = fetchProblem(split_ratio=0.8)
+D = size(X_train)[1];
+
+m = Chain(
+ Dense(D, 16, relu),
+ Dropout(0.5),
+ SVM
+# Dense(32, 1, σ),
+);
+
+custom_loss(x, y) = logitcrossentropy(m(x), y)
+opt = ADAM(); # popular stochastic gradient descent variant
+
+classify(x::Float64) = (x>=0.5) ? 1 : 0
+
+function accuracy(x, y_true)
+ y_pred = classify.(m(x))
+ return sum(y_true .≈ y_pred) / length(y_true)
+end
+
+dataset = repeated((X_train,Y_train), 1) # repeat the data set, very low accuracy on the orig dataset
+evalcb = () -> @show(custom_loss(X_train,Y_train)) # callback to show loss
+
+labels = Y_train # needed for SVM
+for iter in 1:1
+ Flux.train!(custom_loss, params(m), dataset, opt, cb = throttle(evalcb, 5)); #took me ~5 minutes to train on CPU
+end
+
+@show accuracy(X_train, Y_train)
+
+labels = Y_test # needed for SVM
+@show accuracy(X_test, Y_test)
diff --git a/examples/sensitivity-SVM.jl b/examples/sensitivity-SVM.jl
index 1c739a5b6..866b17950 100644
--- a/examples/sensitivity-SVM.jl
+++ b/examples/sensitivity-SVM.jl
@@ -10,6 +10,7 @@ using LinearAlgebra
using MathOptInterface
const MOI = MathOptInterface;
+const MOIU = MOI.Utilities;
# optional Plot to integrate in tests
# set ENV["SVM_PLOT"] = "1" to build plots
@@ -18,11 +19,11 @@ if should_plot
using Plots
end
-N = 50
+N = 100
D = 2
-Random.seed!(rand(1:100))
-X = vcat(randn(N, D), randn(N,D) .+ [4.0,1.5]')
-y = append!(ones(N), -ones(N));
+Random.seed!(62)
+X = vcat(randn(N÷2, D), randn(N÷2,D) .+ [4.0,1.5]')
+y = append!(ones(N÷2), -ones(N÷2));
model = diff_optimizer(SCS.Optimizer)
diff --git a/examples/sensitivity-analysis-ridge.jl b/examples/sensitivity-analysis-ridge.jl
new file mode 100644
index 000000000..2f72c8931
--- /dev/null
+++ b/examples/sensitivity-analysis-ridge.jl
@@ -0,0 +1,109 @@
+"""
+ Source code for the example given in sensitivity-analysis-ridge.md
+"""
+
+import Random
+import OSQP
+using DiffOpt
+using JuMP
+using LinearAlgebra
+
+# optional Plot to integrate in tests
+# set ENV["BUILD_PLOT"] = "1" to build plots
+const should_plot = get(ENV, "BUILD_PLOT", nothing) == "1"
+if should_plot
+ using Plots
+end
+
+function create_problem(N=100)
+ m = 2*abs(randn())
+ b = rand()
+ X = randn(N)
+ Y = m*X .+ b + 0.8*randn(N)
+
+ return X, Y
+end
+
+X, Y = create_problem();
+
+function fitRidge(X,Y,alpha=0.1)
+ model = Model(() -> diff_optimizer(OSQP.Optimizer))
+
+ # add variables
+ @variable(model, w)
+ @variable(model, b)
+ set_optimizer_attribute(model, MOI.Silent(), true)
+
+ @objective(
+ model,
+ Min,
+ sum((Y - w*X .- b).*(Y - w*X .- b)) + alpha*(sum(w*w)+sum(b*b)),
+ )
+
+ optimize!(model)
+
+ loss = objective_value(model)
+ return model, w, b, loss, value(w), value(b)
+end
+
+model, w, b, loss_train, ŵ, b̂ = fitRidge(X, Y)
+
+# plot the regressing line
+if should_plot
+ p = Plots.scatter(X, Y, label="")
+ mi, ma = minimum(X), maximum(X)
+ Plots.plot!(p, [mi, ma], [mi*ŵ+b̂, ma*ŵ+b̂], color=:red, label="")
+end
+
+# get the gradients
+∇ = zero(X)
+for i in 1:length(X)
+ # MOI.set(
+ # model,
+ # DiffOpt.ForwardInObjective(),
+ # w,
+ # -2*(Y[i] + X[i])
+ # )
+ # MOI.set(
+ # model,
+ # DiffOpt.ForwardInObjective(),
+ # w,
+ # w,
+ # 2*X[i]
+ # )
+
+ MOI.set(
+ model,
+ DiffOpt.ForwardInObjective(),
+ MOI.ScalarQuadraticFunction(
+ [MOI.ScalarAffineTerm(-2(Y[i] + X[i]), w.index)],
+ [MOI.ScalarQuadraticTerm(2X[i], w.index, w.index)],
+ 0.0
+ )
+ )
+
+ DiffOpt.forward(model)
+
+ db = MOI.get(
+ model,
+ DiffOpt.ForwardOutVariablePrimal(),
+ b
+ )
+
+ ∇[i] = db
+end
+normalize!(∇);
+
+
+# plot the data-point sensitivities
+if should_plot
+ p = Plots.scatter(
+ X, Y,
+ color=[x>0 ? :red : :blue for x in ∇],
+ markersize=[25*abs(x) for x in ∇],
+ label=""
+ )
+ mi, ma = minimum(X), maximum(X)
+ Plots.plot!(p, [mi, ma], [mi*ŵ+b̂, ma*ŵ+b̂], color=:red, label="")
+end
+
diff --git a/examples/solve-LP.jl b/examples/solve-LP.jl
deleted file mode 100644
index 45c9473f9..000000000
--- a/examples/solve-LP.jl
+++ /dev/null
@@ -1,73 +0,0 @@
-using Random
-using GLPK
-using MathOptInterface
-
-const MOI = MathOptInterface
-const MOIU = MathOptInterface.Utilities;
-
-using Test
-
-D = 10 # variable dimension
-N = 20 # no of inequality constraints
-
-s = rand(N)
-s = 2*s.-1
-λ = max.(-s, 0)
-s = max.(s, 0)
-x̂ = rand(D)
-A = rand(N, D)
-b = A*x̂ + s
-c = -A' * λ
-
-# can feed dual problem to optimizer like this:
-# model = MOI.instantiate(dual_optimizer(GLPK.Optimizer), with_bridge_type=Float64)
-
-model = GLPK.Optimizer()
-x = MOI.add_variables(model, D)
-
-# define objective
-objective_function = MOI.ScalarAffineFunction(MOI.ScalarAffineTerm.(c, x), 0.0)
-MOI.set(model, MOI.ObjectiveFunction{MOI.ScalarAffineFunction{Float64}}(), objective_function)
-MOI.set(model, MOI.ObjectiveSense(), MOI.MIN_SENSE)
-
-# will be useful later
-constraint_indices = []
-
-# set constraints
-for i in 1:N
- push!(constraint_indices, MOI.add_constraint(model,MOI.ScalarAffineFunction(MOI.ScalarAffineTerm.(A[i,:], x), 0.),MOI.LessThan(b[i])))
-end
-
-for i in 1:D
- push!(constraint_indices, MOI.add_constraint(model,MOI.SingleVariable(x[i]),MOI.GreaterThan(0.)))
-end
-
-MOI.optimize!(model)
-
-@assert MOI.get(model, MOI.TerminationStatus()) in [MOI.LOCALLY_SOLVED, MOI.OPTIMAL]
-
-x̄ = MOI.get(model, MOI.VariablePrimal(), x); # solution
-
-@assert abs(c'x̄ - c'x̂) <= 1e-8 # sanity check
-
-
-##
-## Checking KKT conditions
-##
-
-is_less_than(set::S) where {S<:MOI.AbstractSet} = false
-is_less_than(set::MOI.LessThan{T}) where T = true
-
-for con_index in constraint_indices
- con_value = MOI.get(model, MOI.ConstraintPrimal(), con_index)
- set = MOI.get(model, MOI.ConstraintSet(), con_index)
- μ = MOI.get(model, MOI.ConstraintDual(), con_index)
-
- if is_less_than(set)
- # μ[i]*(Ax - b)[i] = 0
- @assert μ*(con_value - set.upper) < 1e-10
- else
- # μ[j]*x[j] = 0
- @assert μ*(con_value - set.lower) < 1e-10
- end
-end
diff --git a/examples/solve-QP.jl b/examples/solve-QP.jl
deleted file mode 100644
index 926e21241..000000000
--- a/examples/solve-QP.jl
+++ /dev/null
@@ -1,86 +0,0 @@
-using Random
-using Test
-
-using MathOptInterface
-const MOI = MathOptInterface
-const MOIU = MathOptInterface.Utilities;
-
-using Ipopt
-
-n = 20 # variable dimension
-m = 15 # no of inequality constraints
-
-x̂ = rand(n)
-Q = rand(n, n)
-Q = Q' * Q # ensure PSD
-q = rand(n)
-G = rand(m, n)
-h = G * x̂ + rand(m);
-
-model = MOI.instantiate(Ipopt.Optimizer, with_bridge_type=Float64)
-x = MOI.add_variables(model, n);
-
-# define objective
-
-quad_terms = MOI.ScalarQuadraticTerm{Float64}[]
-for i in 1:n
- for j in i:n # indexes (i,j), (j,i) will be mirrored. specify only one kind
- push!(quad_terms, MOI.ScalarQuadraticTerm(Q[i,j],x[i],x[j]))
- end
-end
-
-objective_function = MOI.ScalarQuadraticFunction(MOI.ScalarAffineTerm.(q, x),quad_terms,0.0)
-MOI.set(model, MOI.ObjectiveFunction{MOI.ScalarQuadraticFunction{Float64}}(), objective_function)
-MOI.set(model, MOI.ObjectiveSense(), MOI.MIN_SENSE)
-
-# maintain constrain to index map - will be useful later
-constraint_indices = [
- MOI.add_constraint(
- model,
- MOI.ScalarAffineFunction(MOI.ScalarAffineTerm.(G[i,:], x), 0.),MOI.LessThan(h[i]),
- ) for i in 1:m
-]
-
-
-MOI.optimize!(model)
-
-@assert MOI.get(model, MOI.TerminationStatus()) in (MOI.LOCALLY_SOLVED, MOI.OPTIMAL)
-
-x̄ = MOI.get(model, MOI.VariablePrimal(), x)
-
-# objective value (predicted vs actual) sanity check
-@test 0.5*x̄'*Q*x̄ + q'*x̄ <= 0.5*x̂'*Q*x̂ + q'*x̂
-
-# NOTE: can't use Ipopt
-# Ipopt.Optimizer doesn't supports accessing MOI.ObjectiveFunctionType
-
-#
-# Verifying KKT Conditions
-#
-
-# complimentary slackness + dual feasibility
-for i in 1:size(constraint_indices)[1]
- con_index = constraint_indices[i]
- μ_i = MOI.get(model, MOI.ConstraintDual(), con_index)
-
- # μ[i] * (G * x - h)[i] = 0
- @test abs(μ_i * (G[i,:]' * x̄ - h[i])) < 3e-2
-
- # μ[i] <= 0
- @test μ_i <= 1e-2
-end
-
-
-# checking stationarity
-for j in 1:n
- G_mu_sum = 0
-
- for i in 1:size(constraint_indices)[1]
- con_index = constraint_indices[i]
- μ_i = MOI.get(model, MOI.ConstraintDual(), con_index)
-
- G_mu_sum += μ_i * G[i,j]
- end
-
- @test abs(G_mu_sum - (Q * x̄ + q)[j]) < 1e-2
-end
diff --git a/src/diff_opt.jl b/src/diff_opt.jl
index 8339144ec..8b74c243b 100644
--- a/src/diff_opt.jl
+++ b/src/diff_opt.jl
@@ -128,7 +128,7 @@ A `MOI.ScalarQuadraticFunction` can only be used in linearly constrained
quadratic models.
For instance, if the objective contains `θ * (x + 2y)`, for the purpose of
-computinig the derivative with respect to `θ`, the following should be set:
+computing the derivative with respect to `θ`, the following should be set:
```julia
fx = MOI.SingleVariable(x)
fy = MOI.SingleVariable(y)
diff --git a/test/runtests.jl b/test/runtests.jl
index dfeae4f8c..756c67d5d 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -23,11 +23,14 @@ const ATOL = 1e-4
const RTOL = 1e-4
@testset "Examples" begin
- include(joinpath(@__DIR__, "../examples/solve-LP.jl"))
- include(joinpath(@__DIR__, "../examples/solve-QP.jl"))
+ include(joinpath(@__DIR__, "../examples/autotuning-ridge.jl"))
+ include(joinpath(@__DIR__, "../examples/chainrules.jl"))
+ include(joinpath(@__DIR__, "../examples/custom-relu.jl"))
+ include(joinpath(@__DIR__, "../examples/custom-svm.jl"))
include(joinpath(@__DIR__, "../examples/unit_example.jl"))
+ include(joinpath(@__DIR__, "../examples/sensitivity-analysis-ridge.jl"))
include(joinpath(@__DIR__, "../examples/sensitivity-SVM.jl"))
- include(joinpath(@__DIR__, "../examples/chainrules.jl"))
+ include(joinpath(@__DIR__, "../examples/unit_example.jl"))
end
@testset "Generate random problems" begin