diff --git a/Project.toml b/Project.toml
index a3689608f..42b083c15 100644
--- a/Project.toml
+++ b/Project.toml
@@ -23,5 +23,5 @@ JuMP = "1"
 LazyArrays = "0.21, 0.22, 1"
 MathOptInterface = "1.18"
 MathOptSetDistances = "0.2.9"
-ParametricOptInterface = "0.9.0"
+ParametricOptInterface = "0.12.1"
 julia = "1.6"
diff --git a/README.md b/README.md
index 039894448..5913411db 100644
--- a/README.md
+++ b/README.md
@@ -33,15 +33,12 @@ examples, tutorials, and an API reference.
 
 ### DiffOpt-JuMP API with `Parameters`
 
+Here is an example with a Parametric **Linear Program**:
+
 ```julia
 using JuMP, DiffOpt, HiGHS
 
-model = Model(
-    () -> DiffOpt.diff_optimizer(
-        HiGHS.Optimizer;
-        with_parametric_opt_interface = true,
-    ),
-)
+model = DiffOpt.quadratic_diff_model(HiGHS.Optimizer)
 set_silent(model)
 
 p_val = 4.0
@@ -64,9 +61,9 @@ optimize!(model)
 
 # differentiate w.r.t. p
 direction_p = 3.0
-MOI.set(model, DiffOpt.ForwardConstraintSet(), ParameterRef(p), Parameter(direction_p))
+DiffOpt.set_forward_parameter(model, p, direction_p)
 DiffOpt.forward_differentiate!(model)
-@show MOI.get(model, DiffOpt.ForwardVariablePrimal(), x) == direction_p * 3 / pc_val
+@show DiffOpt.get_forward_variable(model, x) == direction_p * 3 / pc_val
 
 # update p and pc
 p_val = 2.0
@@ -82,45 +79,30 @@ optimize!(model)
 DiffOpt.empty_input_sensitivities!(model)
 # differentiate w.r.t. pc
 direction_pc = 10.0
-MOI.set(model, DiffOpt.ForwardConstraintSet(), ParameterRef(pc), Parameter(direction_pc))
+DiffOpt.set_forward_parameter(model, pc, direction_pc)
 DiffOpt.forward_differentiate!(model)
-@show abs(MOI.get(model, DiffOpt.ForwardVariablePrimal(), x) -
+@show abs(DiffOpt.get_forward_variable(model, x) -
     -direction_pc * 3 * p_val / pc_val^2) < 1e-5
 
 # always a good practice to clear previously set sensitivities
 DiffOpt.empty_input_sensitivities!(model)
 # Now, reverse model AD
 direction_x = 10.0
-MOI.set(model, DiffOpt.ReverseVariablePrimal(), x, direction_x)
+DiffOpt.set_reverse_variable(model, x, direction_x)
 DiffOpt.reverse_differentiate!(model)
-@show MOI.get(model, DiffOpt.ReverseConstraintSet(), ParameterRef(p)) == MOI.Parameter(direction_x * 3 / pc_val)
-@show abs(MOI.get(model, DiffOpt.ReverseConstraintSet(), ParameterRef(pc)).value -
-    -direction_x * 3 * p_val / pc_val^2) < 1e-5
+@show DiffOpt.get_reverse_parameter(model, p) == direction_x * 3 / pc_val
+@show DiffOpt.get_reverse_parameter(model, pc) == -direction_x * 3 * p_val / pc_val^2
 ```
 
-### Low level DiffOpt-JuMP API:
-
-A brief example:
+Available models:
+* `DiffOpt.quadratic_diff_model`: Quadratic Programs (QP) and Linear Programs
+(LP)
+* `DiffOpt.conic_diff_model`: Conic Programs (CP) and Linear Programs (LP)
+* `DiffOpt.nonlinear_diff_model`: Nonlinear Programs (NLP), Quadratic Program
+(QP) and Linear Programs (LP)
+* `DiffOpt.diff_model`: Nonlinear Programs (NLP), Conic Programs (CP),
+Quadratic Programs (QP) and Linear Programs (LP)
 
-```julia
-using JuMP, DiffOpt, HiGHS
-# Create a model using the wrapper
-model = Model(() -> DiffOpt.diff_optimizer(HiGHS.Optimizer))
-# Define your model and solve it
-@variable(model, x)
-@constraint(model, cons, x >= 3)
-@objective(model, Min, 2x)
-optimize!(model)
-# Choose the problem parameters to differentiate with respect to, and set their
-# perturbations.
-MOI.set(model, DiffOpt.ReverseVariablePrimal(), x, 1.0)
-# Differentiate the model
-DiffOpt.reverse_differentiate!(model)
-# fetch the gradients
-grad_exp = MOI.get(model, DiffOpt.ReverseConstraintFunction(), cons)  # -3 x - 1
-constant(grad_exp)        # -1
-coefficient(grad_exp, x)  # -3
-```
 
 ## Citing DiffOpt.jl
 
diff --git a/docs/Project.toml b/docs/Project.toml
index c4112c75d..fa1740b93 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -9,6 +9,8 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306"
 MLDatasets = "eb30cadb-4394-5ae3-aed4-317e484a6458"
 MathOptInterface = "b8f27783-ece8-5eb3-8dc8-9495eed66fee"
+ParametricOptInterface = "0ce4ce61-57bf-432b-a095-efac525d185e"
+Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
 SCS = "c946c3f1-0d1f-5ce8-9dea-7daa1f7e2d13"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
diff --git a/docs/src/examples/Thermal_Generation_Dispatch_Example_new.jl b/docs/src/examples/Thermal_Generation_Dispatch_Example_new.jl
new file mode 100644
index 000000000..9e83504ed
--- /dev/null
+++ b/docs/src/examples/Thermal_Generation_Dispatch_Example_new.jl
@@ -0,0 +1,191 @@
+# # Thermal Generation Dispatch Example
+
+#md # [![](https://img.shields.io/badge/GitHub-100000?style=for-the-badge&logo=github&logoColor=white)](@__REPO_ROOT_URL__/examples/Thermal_Generation_Dispatch_Example.jl)
+
+# This example illustrates the sensitivity analysis of thermal generation dispatch problem.
+
+# This problem can be described as the choice of thermal generation `g` given a demand `d`, a price for thermal generation `c` and a penalty price `c_{ϕ}` for any demand not attended ϕ.
+
+# ```math
+# \begin{split}
+# \begin{array} {ll}
+# \mbox{minimize} & \sum_{i=1}^{N} c_{i} g_{i} + c_{\phi} \phi \\
+# \mbox{s.t.} & g_{i} \ge 0 \quad i=1..N  \\
+#             & g_{i} \le G_{i} \quad i=1..N  \\
+#             & \sum_{i=1}^{N} g_{i} + \phi = d\\
+# \end{array}
+# \end{split}
+# ```
+# where
+# - `G_{i}` is the maximum possible generation for a thermal generator `i`
+
+# ## Define and solve the Thermal Dispatch Problem
+
+# First, import the libraries.
+
+using Test
+using JuMP
+import DiffOpt
+import LinearAlgebra: dot
+import HiGHS
+import MathOptInterface as MOI
+import Plots
+
+# Define the model that will be construct given a set of parameters.
+
+function generate_model(
+    d_data::Float64;
+    g_sup::Vector{Float64},
+    c_g::Vector{Float64},
+    c_ϕ::Float64,
+)
+    ## Creation of the Model and Parameters
+    model = DiffOpt.quadratic_diff_model(HiGHS.Optimizer)
+    set_silent(model)
+    I = length(g_sup)
+
+    ## Variables
+    @variable(model, g[i in 1:I] >= 0.0)
+    @variable(model, ϕ >= 0.0)
+
+    ## Parameters
+    @variable(model, d in Parameter(d_data))
+
+    ## Constraints
+    @constraint(model, limit_constraints_sup[i in 1:I], g[i] <= g_sup[i])
+    @constraint(model, demand_constraint, sum(g) + ϕ == d)
+
+    ## Objectives
+    @objective(model, Min, dot(c_g, g) + c_ϕ * ϕ)
+
+    ## Solve the model
+    optimize!(model)
+
+    ## Return the solved model
+    return model
+end
+
+# Define the functions that will get the primal values `g` and `\phi` and sensitivity analysis of the demand `dg/dd` and `d\phi/dd` from a optimized model.
+
+function diff_forward(model::Model, ϵ::Float64 = 1.0)
+    ## Initialization of parameters and references to simplify the notation
+    vect_ref = [model[:g]; model[:ϕ]]
+
+    ## Get the primal solution of the model
+    vect = value.(vect_ref)
+
+    ## Reset the sensitivities of the model
+    DiffOpt.empty_input_sensitivities!(model)
+
+    ## Pass the perturbation to the DiffOpt Framework
+    DiffOpt.set_forward_parameter(model, model[:d], ϵ)
+
+    ## Compute the derivatives with the Forward Mode
+    DiffOpt.forward_differentiate!(model)
+
+    ## Get the derivative of the model
+    dvect = DiffOpt.get_forward_variable.(model, vect_ref)
+
+    ## Return the values as a vector
+    return [vect; dvect]
+end
+
+function diff_reverse(model::Model, ϵ::Float64 = 1.0)
+    ## Initialization of parameters and references to simplify the notation
+    vect_ref = [model[:g]; model[:ϕ]]
+
+    ## Get the primal solution of the model
+    vect = value.(vect_ref)
+
+    ## Set variables needed for the DiffOpt Backward Framework
+    dvect = Array{Float64,1}(undef, length(vect_ref))
+
+    ## Loop for each primal variable
+    for i in 1:I+1
+        ## Reset the sensitivities of the model
+        DiffOpt.empty_input_sensitivities!(model)
+
+        ## Pass the perturbation to the DiffOpt Framework
+        DiffOpt.set_reverse_variable.(model, vect_ref[i], ϵ)
+
+        ## Compute the derivatives with the Forward Mode
+        DiffOpt.reverse_differentiate!(model)
+
+        ## Get the derivative of the model
+        dvect[i] = DiffOpt.get_reverse_parameter(model, model[:d])
+    end
+
+    ## Return the values as a vector
+    return [vect; dvect]
+end
+
+# Initialize of Parameters
+
+g_sup = [10.0, 20.0, 30.0]
+I = length(g_sup)
+d = 0.0:0.1:80
+d_size = length(d)
+c_g = [1.0, 3.0, 5.0]
+c_ϕ = 10.0;
+
+# Generate models for each demand `d`
+models = generate_model.(d; g_sup = g_sup, c_g = c_g, c_ϕ = c_ϕ);
+
+# Get the results of models with the DiffOpt Forward and Backward context
+
+result_forward = diff_forward.(models)
+optimize!.(models)
+result_reverse = diff_reverse.(models);
+
+# Organization of results to plot
+# Initialize data_results that will contain every result
+data_results = Array{Float64,3}(undef, 2, d_size, 2 * (I + 1));
+
+# Populate the data_results array
+for k in 1:d_size
+    data_results[1, k, :] = result_forward[k]
+    data_results[2, k, :] = result_reverse[k]
+end
+
+# ## Results with Plot graphs
+# ### Results for the forward context
+# Result Primal Values:
+Plots.plot(
+    d,
+    data_results[1, :, 1:I+1];
+    title = "Generation by Demand",
+    label = ["Thermal Generation 1" "Thermal Generation 2" "Thermal Generation 3" "Generation Deficit"],
+    xlabel = "Demand [unit]",
+    ylabel = "Generation [unit]",
+)
+
+# Result Sensitivity Analysis:
+Plots.plot(
+    d,
+    data_results[1, :, I+2:2*(I+1)];
+    title = "Sensitivity of Generation by Demand",
+    label = ["T. Gen. 1 Sensitivity" "T. Gen. 2 Sensitivity" "T. Gen. 3 Sensitivity" "Gen. Deficit Sensitivity"],
+    xlabel = "Demand [unit]",
+    ylabel = "Sensitivity [-]",
+)
+
+# ### Results for the reverse context
+# Result Primal Values:
+Plots.plot(
+    d,
+    data_results[2, :, 1:I+1];
+    title = "Generation by Demand",
+    label = ["Thermal Generation 1" "Thermal Generation 2" "Thermal Generation 3" "Generation Deficit"],
+    xlabel = "Demand [unit]",
+    ylabel = "Generation [unit]",
+)
+
+# Result Sensitivity Analysis:
+Plots.plot(
+    d,
+    data_results[2, :, I+2:2*(I+1)];
+    title = "Sensitivity of Generation by Demand",
+    label = ["T. Gen. 1 Sensitivity" "T. Gen. 2 Sensitivity" "T. Gen. 3 Sensitivity" "Gen. Deficit Sensitivity"],
+    xlabel = "Demand [unit]",
+    ylabel = "Sensitivity [-]",
+)
diff --git a/docs/src/examples/autotuning-ridge_new.jl b/docs/src/examples/autotuning-ridge_new.jl
new file mode 100644
index 000000000..b76206fb9
--- /dev/null
+++ b/docs/src/examples/autotuning-ridge_new.jl
@@ -0,0 +1,226 @@
+# # Auto-tuning Hyperparameters (JuMP API)
+
+#md # [![](https://img.shields.io/badge/show-github-579ACA.svg)](@__REPO_ROOT_URL__/docs/src/examples/autotuning-ridge.jl)
+
+# This example shows how to learn a hyperparameter in Ridge Regression using a gradient descent routine.
+# Let the regularized regression problem be formulated as:
+
+# ```math
+# \begin{equation}
+# \min_{w} \quad \frac{1}{2nd} \sum_{i=1}^{n} (w^T x_{i} - y_i)^2 + \frac{\alpha}{2d} \| w \|_2^2
+# \end{equation}
+# ```
+
+# where
+# - `x`, `y` are the data points
+# - `w` are the learned weights
+# - `α` is the hyperparameter acting on regularization.
+
+# The main optimization model will be formulated with JuMP.
+# Using the gradient of the optimal weights with respect to the regularization parameters
+# computed with DiffOpt, we can perform a gradient descent on top of the inner model
+# to minimize the test loss.
+
+# This tutorial uses the following packages
+
+using JuMP     # The mathematical programming modelling language
+import DiffOpt # JuMP extension for differentiable optimization
+import Ipopt   # Optimization solver that handles quadratic programs
+import LinearAlgebra
+import Plots
+import Random
+
+# ## Generating a noisy regression dataset
+
+Random.seed!(42)
+
+N = 100
+D = 20
+noise = 5
+
+w_real = 10 * randn(D)
+X = 10 * randn(N, D)
+y = X * w_real + noise * randn(N)
+l = N ÷ 2  # test train split
+
+X_train = X[1:l, :]
+X_test = X[l+1:N, :]
+y_train = y[1:l]
+y_test = y[l+1:N];
+
+# ## Defining the regression problem
+
+# We implement the regularized regression problem as a function taking the problem data,
+# building a JuMP model and solving it.
+# Note the cubic term in the objective function (`α * dot(w, w)`),
+# currently this is not handled by ParametricOptInterface smoothly,
+# so we use Ipopt as the solver that support parameters as part of
+# nonlinear (here only cubic) objective functions.
+
+function build_fit_ridge(X, y, α_val = 1.0)
+    model = DiffOpt.nonlinear_diff_model(Ipopt.Optimizer)
+    set_silent(model)
+    N, D = size(X)
+    @variable(model, w[1:D])
+    @variable(model, α in Parameter(α_val))
+    @expression(model, err_term, X * w - y)
+    @objective(
+        model,
+        Min,
+        LinearAlgebra.dot(err_term, err_term) / (2 * N * D) +
+        α * LinearAlgebra.dot(w, w) / (2 * D),
+    )
+    return model
+end
+
+function optimize_fit_ridge!(model, α_val)
+    set_parameter_value(model[:α], α_val)
+    optimize!(model)
+    return value.(model[:w])
+end
+
+# We can solve the problem for several values of α
+# to visualize the effect of regularization on the testing and training loss.
+
+αs = 0.00:0.01:0.50
+mse_test = Float64[]
+mse_train = Float64[]
+model = build_fit_ridge(X, y)
+(Ntest, D) = size(X_test)
+(Ntrain, D) = size(X_train)
+for α in αs
+    ŵ = optimize_fit_ridge!(model, α)
+    ŷ_test = X_test * ŵ
+    ŷ_train = X_train * ŵ
+    push!(mse_test, LinearAlgebra.norm(ŷ_test - y_test)^2 / (2 * Ntest * D))
+    push!(
+        mse_train,
+        LinearAlgebra.norm(ŷ_train - y_train)^2 / (2 * Ntrain * D),
+    )
+end
+
+# Visualize the Mean Score Error metric
+
+Plots.plot(
+    αs,
+    mse_test ./ sum(mse_test);
+    label = "MSE test",
+    xaxis = "α",
+    yaxis = "MSE",
+    legend = (0.8, 0.2),
+    width = 3,
+)
+Plots.plot!(
+    αs,
+    mse_train ./ sum(mse_train);
+    label = "MSE train",
+    linestyle = :dash,
+    width = 3,
+)
+Plots.title!("Normalized MSE on training and testing sets")
+
+# ## Leveraging differentiable optimization: computing the derivative of the solution
+
+# Using DiffOpt, we can compute `∂w_i/∂α`, the derivative of the learned solution `̂w`
+# w.r.t. the regularization parameter.
+
+function compute_dw_dα(model, w)
+    D = length(w)
+    dw_dα = zeros(D)
+    DiffOpt.set_forward_parameter(model, model[:α], 1.0)
+    DiffOpt.forward_differentiate!(model)
+    for i in 1:D
+        dw_dα[i] = DiffOpt.get_forward_variable(model, w[i])
+    end
+    return dw_dα
+end
+
+# Using `∂w_i/∂α` computed with `compute_dw_dα`,
+# we can compute the derivative of the test loss w.r.t. the parameter α
+# by composing derivatives.
+
+function d_testloss_dα(model, X_test, y_test, ŵ)
+    N, D = size(X_test)
+    dw_dα = compute_dw_dα(model, model[:w])
+    err_term = X_test * ŵ - y_test
+    return sum(eachindex(err_term)) do i
+        return LinearAlgebra.dot(X_test[i, :], dw_dα) * err_term[i]
+    end / (N * D)
+end
+
+# We can define a meta-optimizer function performing gradient descent
+# on the test loss w.r.t. the regularization parameter.
+
+function descent(α0, max_iters = 100; fixed_step = 0.01, grad_tol = 1e-3)
+    α_s = Float64[]
+    ∂α_s = Float64[]
+    test_loss = Float64[]
+    α = α0
+    N, D = size(X_test)
+    model = build_fit_ridge(X_train, y_train)
+    for iter in 1:max_iters
+        ŵ = optimize_fit_ridge!(model, α)
+        err_term = X_test * ŵ - y_test
+        ∂α = d_testloss_dα(model, X_test, y_test, ŵ)
+        push!(α_s, α)
+        push!(∂α_s, ∂α)
+        push!(test_loss, LinearAlgebra.norm(err_term)^2 / (2 * N * D))
+        α -= fixed_step * ∂α
+        if abs(∂α) ≤ grad_tol
+            break
+        end
+    end
+    return α_s, ∂α_s, test_loss
+end
+
+ᾱ, ∂ᾱ, msē = descent(0.10, 500)
+iters = 1:length(ᾱ);
+
+# Visualize gradient descent and convergence
+
+Plots.plot(
+    αs,
+    mse_test;
+    label = "MSE test",
+    xaxis = ("α"),
+    legend = :topleft,
+    width = 2,
+)
+Plots.plot!(ᾱ, msē; label = "learned α", width = 5, style = :dot)
+Plots.title!("Regularizer learning")
+
+# Visualize the convergence of α to its optimal value
+
+Plots.plot(
+    iters,
+    ᾱ;
+    label = nothing,
+    color = :blue,
+    xaxis = ("Iterations"),
+    legend = :bottom,
+    title = "Convergence of α",
+)
+
+# Visualize the convergence of the objective function
+
+Plots.plot(
+    iters,
+    msē;
+    label = nothing,
+    color = :red,
+    xaxis = ("Iterations"),
+    legend = :bottom,
+    title = "Convergence of MSE",
+)
+
+# Visualize the convergence of the derivative to zero
+
+Plots.plot(
+    iters,
+    ∂ᾱ;
+    label = nothing,
+    color = :green,
+    xaxis = ("Iterations"),
+    legend = :bottom,
+    title = "Convergence of ∂α",
+)
diff --git a/docs/src/examples/chainrules_unit_new.jl b/docs/src/examples/chainrules_unit_new.jl
new file mode 100644
index 000000000..5b56d0338
--- /dev/null
+++ b/docs/src/examples/chainrules_unit_new.jl
@@ -0,0 +1,297 @@
+# # ChainRules integration demo: Relaxed Unit Commitment
+
+#md # [![](https://img.shields.io/badge/show-github-579ACA.svg)](@__REPO_ROOT_URL__/docs/src/examples/chainrules_unit.jl)
+
+# In this example, we will demonstrate the integration of DiffOpt with
+# [ChainRulesCore.jl](https://juliadiff.org/ChainRulesCore.jl/stable/),
+# the library allowing the definition of derivatives for functions
+# that can then be used by automatic differentiation systems.
+
+using JuMP
+import DiffOpt
+import Plots
+import LinearAlgebra: ⋅
+import HiGHS
+import ChainRulesCore
+
+# ## Unit commitment problem
+
+# We will consider a unit commitment problem, finding the cost-minimizing activation
+# of generation units in a power network over multiple time periods.
+# The considered constraints include:
+# - Demand satisfaction of several loads
+# - Ramping constraints
+# - Generation limits.
+
+# The decisions are:
+# - ``u_{it} \in \{0,1\}``: activation of the ``i``-th unit at time ``t``
+# - ``p_{it}``: power output of the ``i``-th unit at time ``t``.
+
+# DiffOpt handles convex optimization problems only, we therefore
+# relax the domain of the ``u_{it}`` variables to ``\left[0,1\right]``.
+
+# ## Primal UC problem
+
+# ChainRules defines the differentiation of functions.
+# The actual function that is differentiated in the context of DiffOpt is the
+# solution map taking in input the problem parameters and returning the solution.
+
+function unit_commitment(
+    _load1_demand,
+    _load2_demand,
+    gen_costs,
+    noload_costs;
+    model = Model(HiGHS.Optimizer),
+    silent = false,
+)
+    MOI.set(model, MOI.Silent(), silent)
+
+    ## Problem data
+    units = [1, 2] # Generator identifiers
+    load_names = ["Load1", "Load2"] # Load identifiers
+    n_periods = 4 # Number of time periods
+    Pmin = Dict(1 => fill(0.5, n_periods), 2 => fill(0.5, n_periods)) # Minimum power output (pu)
+    Pmax = Dict(1 => fill(3.0, n_periods), 2 => fill(3.0, n_periods)) # Maximum power output (pu)
+    RR = Dict(1 => 0.25, 2 => 0.25) # Ramp rates (pu/min)
+    P0 = Dict(1 => 0.0, 2 => 0.0) # Initial power output (pu)
+
+    ## Parameters
+    @variable(model, load1_demand[1:n_periods] in Parameter.(_load1_demand)) # Load 1 demand (pu)
+    @variable(model, load2_demand[1:n_periods] in Parameter.(_load2_demand)) # Load 2 demand (pu)
+    D = Dict("Load1" => load1_demand, "Load2" => load2_demand)
+    @variable(model, Cp[1:2] in Parameter.(gen_costs)) # Generation costs ($/pu)
+    @variable(model, Cnl[1:2] in Parameter.(noload_costs)) # No-load costs ($)
+
+    ## Variables
+    ## Note: u represents the activation of generation units.
+    ## Would be binary in the typical UC problem, relaxed here to u ∈ [0,1]
+    ## for a linear relaxation.
+    @variable(model, 0 <= u[g in units, t in 1:n_periods] <= 1) # Commitment
+    @variable(model, p[g in units, t in 1:n_periods] >= 0) # Power output (pu)
+
+    ## Constraints
+
+    ## Energy balance
+    @constraint(
+        model,
+        energy_balance_cons[t in 1:n_periods],
+        sum(p[g, t] for g in units) == sum(D[l][t] for l in load_names),
+    )
+
+    ## Generation limits
+    @constraint(
+        model,
+        [g in units, t in 1:n_periods],
+        Pmin[g][t] * u[g, t] <= p[g, t]
+    )
+    @constraint(
+        model,
+        [g in units, t in 1:n_periods],
+        p[g, t] <= Pmax[g][t] * u[g, t]
+    )
+
+    ## Ramp rates
+    @constraint(
+        model,
+        [g in units, t in 2:n_periods],
+        p[g, t] - p[g, t-1] <= 60 * RR[g]
+    )
+    @constraint(model, [g in units], p[g, 1] - P0[g] <= 60 * RR[g])
+    @constraint(
+        model,
+        [g in units, t in 2:n_periods],
+        p[g, t-1] - p[g, t] <= 60 * RR[g]
+    )
+    @constraint(model, [g in units], P0[g] - p[g, 1] <= 60 * RR[g])
+
+    ## Objective
+    @objective(
+        model,
+        Min,
+        sum(
+            (Cp[g] * p[g, t]) + (Cnl[g] * u[g, t]) for g in units,
+            t in 1:n_periods
+        ),
+    )
+
+    optimize!(model)
+    ## asserting finite optimal value
+    @assert termination_status(model) == MOI.OPTIMAL
+    ## converting to dense matrix
+    return JuMP.value.(p.data)
+end
+
+m = Model(HiGHS.Optimizer)
+@show unit_commitment(
+    [1.0, 1.2, 1.4, 1.6],
+    [1.0, 1.2, 1.4, 1.6],
+    [1000.0, 1500.0],
+    [500.0, 1000.0],
+    model = m,
+    silent = true,
+)
+
+# ## Perturbation of a single input parameter
+
+# Let us vary the demand at the second time frame on both loads:
+
+demand_values = 0.05:0.05:3.0
+pvalues = map(demand_values) do di
+    return unit_commitment(
+        [1.0, di, 1.4, 1.6],
+        [1.0, di, 1.4, 1.6],
+        [1000.0, 1500.0],
+        [500.0, 1000.0];
+        silent = true,
+    )
+end
+pflat = [getindex.(pvalues, i) for i in eachindex(pvalues[1])];
+
+# The influence of this variation of the demand is piecewise linear on the
+# generation at different time frames:
+
+Plots.scatter(demand_values, pflat; xaxis = ("Demand"), yaxis = ("Generation"))
+Plots.title!("Different time frames and generators")
+Plots.xlims!(0.0, 3.5)
+
+# ## Forward Differentiation
+
+# Forward differentiation rule for the solution map of the unit commitment problem.
+# It takes as arguments:
+# 1. the perturbations on the input parameters
+# 2. the differentiated function
+# 3. the primal values of the input parameters,
+
+# and returns a tuple `(primal_output, perturbations)`, the main primal result
+# and the perturbation propagated to this result:
+
+function ChainRulesCore.frule(
+    (_, Δload1_demand, Δload2_demand, Δgen_costs, Δnoload_costs),
+    ::typeof(unit_commitment),
+    load1_demand,
+    load2_demand,
+    gen_costs,
+    noload_costs;
+    optimizer = HiGHS.Optimizer,
+)
+    ## creating the UC model with a DiffOpt optimizer wrapper around HiGHS
+    model = DiffOpt.diff_model(optimizer)
+    ## building and solving the main model
+    pv = unit_commitment(
+        load1_demand,
+        load2_demand,
+        gen_costs,
+        noload_costs;
+        model = model,
+    )
+    ## Setting perturbations in the parameters
+    DiffOpt.set_forward_parameter.(model, model[:load1_demand], Δload1_demand)
+    DiffOpt.set_forward_parameter.(model, model[:load2_demand], Δload2_demand)
+    DiffOpt.set_forward_parameter.(model, model[:Cp], Δgen_costs)
+    DiffOpt.set_forward_parameter.(model, model[:Cnl], Δnoload_costs)
+    ## computing the forward differentiation
+    DiffOpt.forward_differentiate!(model)
+    ## querying the corresponding perturbation of the decision
+    Δp = DiffOpt.get_forward_variable.(model, model[:p])
+    return (pv, Δp.data)
+end
+
+# We can now compute the perturbation of the output powers `Δpv`
+# for a perturbation of the first load demand at time 2:
+
+load1_demand = [1.0, 1.0, 1.4, 1.6]
+load2_demand = [1.0, 1.0, 1.4, 1.6]
+gen_costs = [1000.0, 1500.0]
+noload_costs = [500.0, 1000.0];
+
+# all input perturbations are 0
+# except first load at time 2
+Δload1_demand = 0 * load1_demand
+Δload1_demand[2] = 1.0
+Δload2_demand = 0 * load2_demand
+Δgen_costs = 0 * gen_costs
+Δnoload_costs = 0 * noload_costs
+(pv, Δpv) = ChainRulesCore.frule(
+    (nothing, Δload1_demand, Δload2_demand, Δgen_costs, Δnoload_costs),
+    unit_commitment,
+    load1_demand,
+    load2_demand,
+    gen_costs,
+    noload_costs,
+)
+
+Δpv
+
+# The result matches what we observe in the previous figure:
+# the generation of the first generator at the second time frame (third element on the plot).
+
+# # Reverse-mode differentiation of the solution map
+
+# The `rrule` returns the primal and a pullback.
+# The pullback takes a seed for the optimal solution `̄p` and returns
+# derivatives with respect to each input parameter of the function.
+
+function ChainRulesCore.rrule(
+    ::typeof(unit_commitment),
+    load1_demand,
+    load2_demand,
+    gen_costs,
+    noload_costs;
+    optimizer = HiGHS.Optimizer,
+    silent = false,
+)
+    model = DiffOpt.diff_model(optimizer)
+    ## solve the forward UC problem
+    pv = unit_commitment(
+        load1_demand,
+        load2_demand,
+        gen_costs,
+        noload_costs;
+        model = model,
+        silent = silent,
+    )
+    function pullback_unit_commitment(pb)
+        ## set sensitivities
+        DiffOpt.set_reverse_variable.(model, model[:p], pb)
+        ## compute the gradients
+        DiffOpt.reverse_differentiate!(model)
+        ## retrieve the gradients with respect to the parameters
+        dload1_demand =
+            DiffOpt.get_reverse_parameter.(model, model[:load1_demand])
+        dload2_demand =
+            DiffOpt.get_reverse_parameter.(model, model[:load2_demand])
+        dgen_costs = DiffOpt.get_reverse_parameter.(model, model[:Cp])
+        dnoload_costs = DiffOpt.get_reverse_parameter.(model, model[:Cnl])
+        return (dload1_demand, dload2_demand, dgen_costs, dnoload_costs)
+    end
+    return (pv, pullback_unit_commitment)
+end
+
+# We can set a seed of one on the power of the first generator at the second time frame and zero for all other
+# parts of the solution:
+
+(pv, pullback_unit_commitment) = ChainRulesCore.rrule(
+    unit_commitment,
+    load1_demand,
+    load2_demand,
+    gen_costs,
+    noload_costs;
+    optimizer = HiGHS.Optimizer,
+    silent = true,
+)
+dpv = 0 * pv
+dpv[1, 2] = 1
+dargs = pullback_unit_commitment(dpv)
+(dload1_demand, dload2_demand, dgen_costs, dnoload_costs) = dargs;
+
+# The sensitivities with respect to the load demands are:
+dload1_demand
+
+# and:
+dload2_demand
+
+# The sensitivity of the generation is propagated to the sensitivity of both
+# loads at the second time frame.
+
+# This example integrating ChainRules was designed with support
+# from [Invenia Technical Computing](https://www.invenia.ca/).
diff --git a/docs/src/examples/custom-relu_new.jl b/docs/src/examples/custom-relu_new.jl
new file mode 100644
index 000000000..160bde4f5
--- /dev/null
+++ b/docs/src/examples/custom-relu_new.jl
@@ -0,0 +1,128 @@
+# # Custom ReLU layer
+
+#md # [![](https://img.shields.io/badge/show-github-579ACA.svg)](@__REPO_ROOT_URL__/docs/src/examples/custom-relu.jl)
+
+# We demonstrate how DiffOpt can be used to generate a simple neural network
+# unit - the ReLU layer. A neural network is created using Flux.jl and
+# trained on the MNIST dataset.
+
+# This tutorial uses the following packages
+
+using JuMP
+import DiffOpt
+import Ipopt
+import ChainRulesCore
+import Flux
+import MLDatasets
+import Statistics
+import Base.Iterators: repeated
+using LinearAlgebra
+
+# ## The ReLU and its derivative
+
+# Define a relu through an optimization problem solved by a quadratic solver.
+# Return the solution of the problem.
+# TODO: use HiGHS
+function matrix_relu(
+    y_data::Matrix;
+    model = DiffOpt.nonlinear_diff_model(Ipopt.Optimizer),
+)
+    layer_size, batch_size = size(y_data)
+    empty!(model)
+    set_silent(model)
+    @variable(model, x[1:layer_size, 1:batch_size] >= 0)
+    @variable(model, y[1:layer_size, 1:batch_size] in Parameter.(y_data))
+    @objective(model, Min, x[:]'x[:] - 2y[:]'x[:])
+    optimize!(model)
+    return Float32.(value.(x))
+end
+
+# Define the reverse differentiation rule, for the function we defined above.
+function ChainRulesCore.rrule(
+    ::typeof(matrix_relu),
+    y_data::Matrix{T},
+) where {T}
+    model = DiffOpt.nonlinear_diff_model(Ipopt.Optimizer)
+    pv = matrix_relu(y_data; model = model)
+    function pullback_matrix_relu(dl_dx)
+        ## some value from the backpropagation (e.g., loss) is denoted by `l`
+        ## so `dl_dy` is the derivative of `l` wrt `y`
+        x = model[:x]::Matrix{JuMP.VariableRef} # load decision variable `x` into scope
+        y = model[:y]::Matrix{JuMP.VariableRef} # load parameter variable `y` into scope
+        ## set sensitivities (dl/dx)
+        for i in eachindex(x)
+            DiffOpt.set_reverse_variable(model, x[i], dl_dx[i])
+        end
+        ## compute grad (dx/dy)
+        DiffOpt.reverse_differentiate!(model)
+        ## return gradient (dl/dy = dl/dx * dx/dy)
+        dl_dy = DiffOpt.get_reverse_parameter.(model, y)
+        return (ChainRulesCore.NoTangent(), dl_dy)
+    end
+    return pv, pullback_matrix_relu
+end
+
+# For more details about backpropagation, visit [Introduction, ChainRulesCore.jl](https://juliadiff.org/ChainRulesCore.jl/dev/).
+
+# ## Define the network
+
+layer_size = 10
+m = Flux.Chain(
+    Flux.Dense(784, layer_size), # 784 being image linear dimension (28 x 28)
+    matrix_relu,
+    Flux.Dense(layer_size, 10), # 10 being the number of outcomes (0 to 9)
+    Flux.softmax,
+)
+
+# ## Prepare data
+
+N = 1000 # batch size
+## Preprocessing train data
+imgs = MLDatasets.MNIST(; split = :train).features[:, :, 1:N]
+labels = MLDatasets.MNIST(; split = :train).targets[1:N]
+train_X = float.(reshape(imgs, size(imgs, 1) * size(imgs, 2), N)) # stack images
+train_Y = Flux.onehotbatch(labels, 0:9);
+## Preprocessing test data
+test_imgs = MLDatasets.MNIST(; split = :test).features[:, :, 1:N]
+test_labels = MLDatasets.MNIST(; split = :test).targets[1:N];
+test_X = float.(reshape(test_imgs, size(test_imgs, 1) * size(test_imgs, 2), N))
+test_Y = Flux.onehotbatch(test_labels, 0:9);
+
+# Define input data
+# The original data is repeated `epochs` times because `Flux.train!` only
+# loops through the data set once
+
+epochs = 2#50 # ~1 minute (i7 8th gen with 16gb RAM)
+## epochs = 100 # leads to 77.8% in about 2 minutes
+dataset = repeated((train_X, train_Y), epochs);
+
+# ## Network training
+
+# training loss function, Flux optimizer
+custom_loss(m, x, y) = Flux.crossentropy(m(x), y)
+opt = Flux.setup(Flux.Adam(), m)
+
+# Train to optimize network parameters
+
+@time Flux.train!(custom_loss, m, dataset, opt);
+
+# Although our custom implementation takes time, it is able to reach similar
+# accuracy as the usual ReLU function implementation.
+
+# ## Accuracy results
+
+# Average of correct guesses
+
+accuracy(x, y) = Statistics.mean(Flux.onecold(m(x)) .== Flux.onecold(y));
+
+# Training accuracy
+
+accuracy(train_X, train_Y)
+
+# Test accuracy
+
+accuracy(test_X, test_Y)
+
+# Note that the accuracy is low due to simplified training.
+# It is possible to increase the number of samples `N`,
+# the number of epochs `epoch` and the connectivity `inner`.
diff --git a/docs/src/examples/matrix-inversion-manual_new.jl b/docs/src/examples/matrix-inversion-manual_new.jl
new file mode 100644
index 000000000..9400d42a8
--- /dev/null
+++ b/docs/src/examples/matrix-inversion-manual_new.jl
@@ -0,0 +1,150 @@
+# # Differentiating a QP wrt a single variable
+
+#md # [![](https://img.shields.io/badge/show-github-579ACA.svg)](@__REPO_ROOT_URL__/docs/src/examples/matrix-inversion-manual.jl)
+
+# Consider the quadratic program
+
+# ```math
+# \begin{split}
+# \begin{array} {ll}
+# \mbox{minimize} & \frac{1}{2} x^T Q x + q^T x \\
+# \mbox{subject to} & G x \leq h, x \in \mathcal{R}^2 \\
+# \end{array}
+# \end{split}
+# ```
+
+# where `Q`, `q`, `G` are fixed and `h` is the single parameter.
+
+# In this example, we'll try to differentiate the QP wrt `h`, by finding its
+# jacobian by hand (using Eqn (6) of [QPTH article](https://arxiv.org/pdf/1703.00443.pdf))
+# and compare the results:
+# - Manual compuation
+# - Using JuMP and DiffOpt
+
+# Assuming 
+# ```
+# Q = [[4, 1], [1, 2]]
+# q = [1, 1]
+# G = [1, 1]
+# ```
+# and begining with a starting value of `h=-1`
+
+# few values just for reference
+
+# | variable | optimal value | note |
+# |----|------|-----|
+# | x* | [-0.25; -0.75] | Primal optimal | 
+# | 𝜆∗ | -0.75 | Dual optimal | 
+
+# ## Finding Jacobian using matrix inversion
+# Lets formulate Eqn (6) of [QPTH article](https://arxiv.org/pdf/1703.00443.pdf) for our QP. If we assume `h` as the only parameter and `Q`,`q`,`G` as fixed problem data - also note that our QP doesn't involves `Ax=b` constraint - then Eqn (6) reduces to 
+# ```math
+# \begin{gather}
+#  \begin{bmatrix} 
+#      Q & G^T \\
+#      \lambda^* G & G x^* - h
+#  \end{bmatrix}
+#  \begin{bmatrix} 
+#      dx \\
+#      d \lambda
+#  \end{bmatrix}
+#  =
+#   \begin{bmatrix}
+#    0 \\
+#    \lambda^* dh
+#    \end{bmatrix}
+# \end{gather}
+# ```
+
+# Now to find the jacobians $$ \frac{\partial x}{\partial h}, \frac{\partial \lambda}{\partial h}$$
+# we substitute `dh = I = [1]` and plug in values of `Q`,`q`,`G` to get
+# ```math
+# \begin{gather}
+#  \begin{bmatrix} 
+#      4 & 1 & 1 \\
+#      1 & 2 & 1 \\
+#      -0.75 & -0.75 & 0
+#  \end{bmatrix}
+#  \begin{bmatrix} 
+#      \frac{\partial x_1}{\partial h} \\
+#      \frac{\partial x_2}{\partial h} \\
+#      \frac{\partial \lambda}{\partial h}
+#  \end{bmatrix}
+#  =
+#   \begin{bmatrix}
+#    0 \\
+#    0 \\
+#    -0.75
+#    \end{bmatrix}
+# \end{gather}
+# ```
+
+# Upon solving using matrix inversion, the jacobian is
+# ```math
+# \frac{\partial x_1}{\partial h} = 0.25, \frac{\partial x_2}{\partial h} = 0.75, \frac{\partial \lambda}{\partial h} = -1.75 
+# ```
+
+# ## Finding Jacobian using JuMP and DiffOpt
+
+using JuMP
+import DiffOpt
+import Ipopt
+
+n = 2 # variable dimension
+m = 1; # no of inequality constraints
+
+Q = [4.0 1.0; 1.0 2.0]
+q = [1.0; 1.0]
+G = [1.0 1.0;]
+h = [-1.0;]   # initial values set
+
+# Initialize empty model
+
+model = DiffOpt.quadratic_diff_model(Ipopt.Optimizer)
+set_silent(model)
+
+# Add the variables
+
+@variable(model, x[1:2])
+
+# Add the parameters
+
+@variable(model, y[1:length(h)] in Parameter.(h))
+
+# Add the constraints.
+
+@constraint(model, cons[j in 1:1], sum(G[j, i] * x[i] for i in 1:2) <= y[j]);
+
+@objective(
+    model,
+    Min,
+    1 / 2 * sum(Q[j, i] * x[i] * x[j] for i in 1:2, j in 1:2) +
+    sum(q[i] * x[i] for i in 1:2)
+)
+
+# Solve problem
+
+optimize!(model)
+
+# primal solution
+
+value.(x)
+
+# dual solution
+
+dual.(cons)
+
+# set sensitivitity
+
+DiffOpt.set_forward_parameter.(model, y, 1.0)
+
+# Compute derivatives
+
+DiffOpt.forward_differentiate!(model)
+
+# Query derivative
+
+dx = DiffOpt.get_forward_variable.(model, x)
+
+using Test                                  #src
+@test dx ≈ [0.25, 0.75] atol = 1e-4 rtol = 1e-4 #src
diff --git a/docs/src/examples/nearest_correlation_new.jl b/docs/src/examples/nearest_correlation_new.jl
new file mode 100644
index 000000000..d22bdd0e9
--- /dev/null
+++ b/docs/src/examples/nearest_correlation_new.jl
@@ -0,0 +1,66 @@
+# # Nearest correlation
+
+#md # [![](https://img.shields.io/badge/show-github-579ACA.svg)](@__REPO_ROOT_URL__/docs/src/examples/nearest_correlation.jl)
+
+# This example illustrates the sensitivity analysis of the nearest correlation problem studied in [H02].
+#
+# Higham, Nicholas J.
+# *Computing the nearest correlation matrix—a problem from finance.*
+# IMA journal of Numerical Analysis 22.3 (2002): 329-343.
+
+using DiffOpt, JuMP, SCS, LinearAlgebra
+solver = SCS.Optimizer
+
+function proj(A, dH = Diagonal(ones(size(A, 1))), H_data = ones(size(A)))
+    n = LinearAlgebra.checksquare(A)
+    model = Model(() -> DiffOpt.diff_optimizer(solver))
+    @variable(model, X[1:n, 1:n] in PSDCone())
+    @variable(model, H[1:n, 1:n] in Parameter.(H_data))
+    @variable(model, E[1:n, 1:n])
+    @constraint(model, [i in 1:n], X[i, i] == 1)
+    @constraint(model, E .== (H .* (X .- A)))
+    @objective(model, Min, sum(E .^ 2))
+    for i in 1:n
+        DiffOpt.set_forward_parameter(model, H[i, i], dH[i, i])
+    end
+    optimize!(model)
+    DiffOpt.forward_differentiate!(model)
+    dX = DiffOpt.get_forward_variable.(model, X)
+    return value.(X), dX
+end
+
+# Example from [H02, p. 334-335]:
+
+A = LinearAlgebra.Tridiagonal(ones(2), ones(3), ones(2))
+
+# The projection is computed as follows:
+
+X, dX = proj(A)
+nothing # hide
+
+# The projection of `A` is:
+
+X
+
+# The derivative of the projection with respect to a uniform increase of the weights
+# of the diagonal entries is:
+
+dX
+
+# Example from [H02, Section 4, p. 340]:
+
+A = LinearAlgebra.Tridiagonal(-ones(3), 2ones(4), -ones(3))
+
+# The projection is computed as follows:
+
+X, dX = proj(A)
+nothing # hide
+
+# The projection of `A` is:
+
+X
+
+# The derivative of the projection with respect to a uniform increase of the weights
+# of the diagonal entries is:
+
+dX
diff --git a/docs/src/examples/polyhedral_project_new.jl b/docs/src/examples/polyhedral_project_new.jl
new file mode 100644
index 000000000..aa2c9b91a
--- /dev/null
+++ b/docs/src/examples/polyhedral_project_new.jl
@@ -0,0 +1,171 @@
+# # Polyhedral QP layer
+
+#md # [![](https://img.shields.io/badge/show-github-579ACA.svg)](@__REPO_ROOT_URL__/docs/src/examples/polyhedral_project.jl)
+
+# We use DiffOpt to define a custom network layer which, given an input matrix `y`,
+# computes its projection onto a polytope defined by a fixed number of inequalities:
+# `a_i^T x ≥ b_i`.
+# A neural network is created using Flux.jl and trained on the MNIST dataset,
+# integrating this quadratic optimization layer.
+#
+# The QP is solved in the forward pass, and its DiffOpt derivative is used in the backward pass expressed with `ChainRulesCore.rrule`.
+
+# This example is similar to the custom ReLU layer, except that the layer is parameterized
+# by the hyperplanes `(w,b)` and not a simple stateless function.
+# This also means that `ChainRulesCore.rrule` must return the derivatives of the output with respect to the
+# layer parameters to allow for backpropagation.
+
+using JuMP
+import DiffOpt
+import Ipopt
+import ChainRulesCore
+import Flux
+import MLDatasets
+import Statistics
+using Base.Iterators: repeated
+using LinearAlgebra
+using Random
+
+Random.seed!(42)
+
+# ## The Polytope representation and its derivative
+
+struct Polytope{N}
+    w::NTuple{N,Vector{Float64}}
+    b::Vector{Float64}
+end
+
+Polytope(w::NTuple{N}) where {N} = Polytope{N}(w, randn(N))
+
+# We define a "call" operation on the polytope, making it a so-called functor.
+# Calling the polytope with a matrix `y` operates an Euclidean projection of this matrix onto the polytope.
+function (polytope::Polytope{N})(
+    y_data::AbstractMatrix;
+    model = DiffOpt.nonlinear_diff_model(Ipopt.Optimizer),
+) where {N}
+    layer_size, batch_size = size(y_data)
+    empty!(model)
+    set_silent(model)
+    @variable(model, x[1:layer_size, 1:batch_size])
+    @variable(model, y[1:layer_size, 1:batch_size] in Parameter.(y_data))
+    @variable(model, b[idx = 1:N] in Parameter.(polytope.b[idx]))
+    @variable(
+        model,
+        w[idx = 1:N, i = 1:layer_size] in Parameter(polytope.w[idx][i])
+    )
+    @constraint(
+        model,
+        greater_than_cons[idx in 1:N, sample in 1:batch_size],
+        dot(polytope.w[idx], x[:, sample]) ≥ b[idx]
+    )
+    @objective(model, Min, dot(x - y, x - y))
+    optimize!(model)
+    return Float32.(JuMP.value.(x))
+end
+
+# The `@functor` macro from Flux implements auxiliary functions for collecting the parameters of
+# our custom layer and operating backpropagation.
+Flux.@functor Polytope
+
+# Define the reverse differentiation rule, for the function we defined above.
+# Flux uses ChainRules primitives to implement reverse-mode differentiation of the whole network.
+# To learn the current layer (the polytope the layer contains),
+# the gradient is computed with respect to the `Polytope` fields in a ChainRulesCore.Tangent type
+# which is used to represent derivatives with respect to structs.
+# For more details about backpropagation, visit [Introduction, ChainRulesCore.jl](https://juliadiff.org/ChainRulesCore.jl/dev/).
+
+function ChainRulesCore.rrule(
+    polytope::Polytope{N},
+    y_data::AbstractMatrix,
+) where {N}
+    model = DiffOpt.nonlinear_diff_model(Ipopt.Optimizer)
+    xv = polytope(y_data; model = model)
+    function pullback_matrix_projection(dl_dx)
+        dl_dx = ChainRulesCore.unthunk(dl_dx)
+        ##  `dl_dy` is the derivative of `l` wrt `y`
+        x = model[:x]::Matrix{JuMP.VariableRef}
+        y = model[:y]::Matrix{JuMP.VariableRef}
+        w = model[:w]::Matrix{JuMP.VariableRef}
+        b = model[:b]::Vector{JuMP.VariableRef}
+        layer_size, batch_size = size(x)
+        ## set sensitivities
+        for i in eachindex(x)
+            DiffOpt.set_reverse_variable(model, x[i], dl_dx[i])
+        end
+        ## compute grad
+        DiffOpt.reverse_differentiate!(model)
+        ## compute gradient wrt objective function parameter y
+        dl_dy = DiffOpt.get_reverse_parameter.(model, y)
+        ## compute gradient wrt objective function parameter w and b
+        _dl_dw = DiffOpt.get_reverse_parameter.(model, w)
+        dl_dw = zero.(polytope.w)
+        for idx in 1:N
+            dl_dw[idx] .= _dl_dw[idx, :]
+        end
+        dl_db = DiffOpt.get_reverse_parameter.(model, b)
+        dself = ChainRulesCore.Tangent{Polytope{N}}(; w = dl_dw, b = dl_db)
+        return (dself, dl_dy)
+    end
+    return xv, pullback_matrix_projection
+end
+
+# ## Define the Network
+
+layer_size = 20
+m = Flux.Chain(
+    Flux.Dense(784, layer_size), # 784 being image linear dimension (28 x 28)
+    Polytope((randn(layer_size), randn(layer_size), randn(layer_size))),
+    Flux.Dense(layer_size, 10), # 10 being the number of outcomes (0 to 9)
+    Flux.softmax,
+)
+
+# ## Prepare data
+
+M = 500 # batch size
+## Preprocessing train data
+imgs = MLDatasets.MNIST(; split = :train).features[:, :, 1:M]
+labels = MLDatasets.MNIST(; split = :train).targets[1:M]
+train_X = float.(reshape(imgs, size(imgs, 1) * size(imgs, 2), M)) # stack images
+train_Y = Flux.onehotbatch(labels, 0:9);
+## Preprocessing test data
+test_imgs = MLDatasets.MNIST(; split = :test).features[:, :, 1:M]
+test_labels = MLDatasets.MNIST(; split = :test).targets[1:M]
+test_X = float.(reshape(test_imgs, size(test_imgs, 1) * size(test_imgs, 2), M))
+test_Y = Flux.onehotbatch(test_labels, 0:9);
+
+# Define input data
+# The original data is repeated `epochs` times because `Flux.train!` only
+# loops through the data set once
+
+epochs = 5
+dataset = repeated((train_X, train_Y), epochs);
+
+# ## Network training
+
+# training loss function, Flux optimizer
+custom_loss(m, x, y) = Flux.crossentropy(m(x), y)
+opt = Flux.setup(Flux.Adam(), m)
+
+# Train to optimize network parameters
+
+@time Flux.train!(custom_loss, m, dataset, opt);
+
+# Although our custom implementation takes time, it is able to reach similar
+# accuracy as the usual ReLU function implementation.
+
+# ## Accuracy results
+
+# Average of correct guesses
+accuracy(x, y) = Statistics.mean(Flux.onecold(m(x)) .== Flux.onecold(y));
+
+# Training accuracy
+
+accuracy(train_X, train_Y)
+
+# Test accuracy
+
+accuracy(test_X, test_Y)
+
+# Note that the accuracy is low due to simplified training.
+# It is possible to increase the number of samples `N`,
+# the number of epochs `epoch` and the connectivity `inner`.
diff --git a/docs/src/examples/sensitivity-analysis-ridge_new.jl b/docs/src/examples/sensitivity-analysis-ridge_new.jl
new file mode 100644
index 000000000..af91512f2
--- /dev/null
+++ b/docs/src/examples/sensitivity-analysis-ridge_new.jl
@@ -0,0 +1,171 @@
+# # Sensitivity Analysis of Ridge Regression
+
+#md # [![](https://img.shields.io/badge/show-github-579ACA.svg)](@__REPO_ROOT_URL__/docs/src/examples/sensitivity-analysis-ridge.jl)
+
+# This example illustrates the sensitivity analysis of data points in a
+# [Ridge Regression](https://en.wikipedia.org/wiki/Ridge_regression) problem.
+# The general form of the problem is given below:
+
+# ```math
+# \begin{split}
+# \begin{array} {ll}
+# \mbox{minimize} & \sum_{i=1}^{N} (y_{i} - w x_{i} - b)^2 + \alpha (w^2 + b^2) \\
+# \end{array}
+# \end{split}
+# ```
+# where
+# - `w`, `b` are slope and intercept of the regressing line
+# - `x`, `y` are the N data points
+# - `α` is the regularization constant
+#
+# which is equivalent to:
+# ```math
+# \begin{split}
+# \begin{array} {ll}
+# \mbox{minimize} & e^{\top}e + \alpha (w^2) \\
+# \mbox{s.t.} & e_{i} = y_{i} - w x_{i} - b \quad \quad i=1..N  \\
+# \end{array}
+# \end{split}
+# ```
+
+# This tutorial uses the following packages
+
+using JuMP
+import DiffOpt
+import Random
+import Ipopt
+import Plots
+using LinearAlgebra: dot
+
+# ## Define and solve the problem
+
+# Construct a set of noisy (guassian) data points around a line.
+
+Random.seed!(42)
+
+N = 150
+
+w_orig = 2 * abs(randn())
+b = rand()
+X = randn(N)
+Y = w_orig * X .+ b + 0.8 * randn(N);
+
+# The helper method `fit_ridge` defines and solves the corresponding model.
+# The ridge regression is modeled with quadratic programming
+# (quadratic objective and linear constraints) and solved in generic methods
+# of Ipopt. This is not the standard way of solving the ridge regression problem
+# this is done here for didactic purposes.
+
+function build_fit_ridge(X_data, Y_data, alpha = 0.1)
+    N = length(Y_data)
+    ## Initialize a JuMP Model with Ipopt solver
+    ## model = Model(() -> DiffOpt.diff_optimizer(Ipopt.Optimizer)) # TODO: this is not auto detecting scalar nonlinear function (alpha * w * w)
+    model = DiffOpt.nonlinear_diff_model(Ipopt.Optimizer)
+    set_silent(model)
+    @variable(model, w) # angular coefficient
+    @variable(model, b) # linear coefficient
+    @variable(model, α in Parameter(alpha)) # regularization parameter
+    @variable(model, X[1:N] in Parameter.(X_data))
+    @variable(model, Y[1:N] in Parameter.(Y_data))
+    ## expression defining approximation error
+    @expression(model, e[i = 1:N], Y[i] - w * X[i] - b)
+    ## objective minimizing squared error and ridge penalty
+    @objective(model, Min, 1 / N * dot(e, e) + α * (w^2))
+    optimize!(model)
+    return model
+end
+
+function optimize_fit_ridge!(model, alpha)
+    set_parameter_value(model[:α], alpha)
+    optimize!(model)
+    return model[:w], model[:b]
+end
+
+# Plot the data points and the fitted line for different alpha values
+
+p = Plots.scatter(X, Y; label = nothing, legend = :topleft)
+mi, ma = minimum(X), maximum(X)
+Plots.title!("Fitted lines and points")
+
+model = build_fit_ridge(X, Y)
+for alpha in 0.5:0.5:1.5
+    local w, b = optimize_fit_ridge!(model, alpha)
+    ŵ = value(w)
+    b̂ = value(b)
+    Plots.plot!(
+        p,
+        [mi, ma],
+        [mi * ŵ + b̂, ma * ŵ + b̂];
+        label = "alpha=$alpha",
+        width = 2,
+    )
+end
+p
+
+# ## Differentiate
+
+# Now that we've solved the problem, we can compute the sensitivity of optimal
+# values of the slope `w` with
+# respect to perturbations in the data points (`x`,`y`).
+
+alpha = 0.4
+w, b = optimize_fit_ridge!(model, alpha)
+ŵ = value(w)
+b̂ = value(b)
+
+# Sensitivity with respect to x and y
+
+∇y = zero(X)
+∇x = zero(X)
+for i in 1:N
+    ## X[i] sensitivity
+    DiffOpt.empty_input_sensitivities!(model)
+    DiffOpt.set_forward_parameter(model, model[:X][i], 1.0)
+    DiffOpt.forward_differentiate!(model)
+    ∇x[i] = DiffOpt.get_forward_variable(model, w)
+    ## Y[i] sensitivity
+    DiffOpt.empty_input_sensitivities!(model)
+    DiffOpt.set_forward_parameter(model, model[:Y][i], 1.0)
+    DiffOpt.forward_differentiate!(model)
+    ∇y[i] = DiffOpt.get_forward_variable(model, w)
+end
+
+# Visualize point sensitivities with respect to regression points.
+
+p = Plots.scatter(
+    X,
+    Y;
+    color = [dw < 0 ? :blue : :red for dw in ∇x],
+    markersize = [5 * abs(dw) + 1.2 for dw in ∇x],
+    label = "",
+)
+mi, ma = minimum(X), maximum(X)
+Plots.plot!(
+    p,
+    [mi, ma],
+    [mi * ŵ + b̂, ma * ŵ + b̂];
+    color = :blue,
+    label = "",
+)
+Plots.title!("Regression slope sensitivity with respect to x")
+
+#
+
+p = Plots.scatter(
+    X,
+    Y;
+    color = [dw < 0 ? :blue : :red for dw in ∇y],
+    markersize = [5 * abs(dw) + 1.2 for dw in ∇y],
+    label = "",
+)
+mi, ma = minimum(X), maximum(X)
+Plots.plot!(
+    p,
+    [mi, ma],
+    [mi * ŵ + b̂, ma * ŵ + b̂];
+    color = :blue,
+    label = "",
+)
+Plots.title!("Regression slope sensitivity with respect to y")
+
+# Note the points with less central `x` values induce a greater y sensitivity of the slope.
diff --git a/docs/src/examples/sensitivity-analysis-svm_new.jl b/docs/src/examples/sensitivity-analysis-svm_new.jl
new file mode 100644
index 000000000..a61f75cf5
--- /dev/null
+++ b/docs/src/examples/sensitivity-analysis-svm_new.jl
@@ -0,0 +1,144 @@
+# # Sensitivity Analysis of SVM
+
+#md # [![](https://img.shields.io/badge/show-github-579ACA.svg)](@__REPO_ROOT_URL__/docs/src/examples/sensitivity-analysis-svm.jl)
+
+# This notebook illustrates sensitivity analysis of data points in a [Support Vector Machine](https://en.wikipedia.org/wiki/Support-vector_machine) (inspired from [@matbesancon](http://github.com/matbesancon)'s [SimpleSVMs](http://github.com/matbesancon/SimpleSVMs.jl).)
+
+# For reference, Section 10.1 of https://online.stat.psu.edu/stat508/book/export/html/792 gives an intuitive explanation of what it means to have a sensitive hyperplane or data point. The general form of the SVM training problem is given below (with $\ell_2$ regularization):
+
+# ```math
+# \begin{split}
+# \begin{array} {ll}
+# \mbox{minimize} & \lambda||w||^2 + \sum_{i=1}^{N} \xi_{i} \\
+# \mbox{s.t.} & \xi_{i} \ge 0 \quad \quad i=1..N  \\
+#             & y_{i} (w^T X_{i} + b) \ge 1 - \xi_{i} \quad i=1..N  \\
+# \end{array}
+# \end{split}
+# ```
+# where
+# - `X`, `y` are the `N` data points
+# - `w` is the support vector
+# - `b` determines the offset `b/||w||` of the hyperplane with normal `w`
+# - `ξ` is the soft-margin loss
+# - `λ` is the $\ell_2$ regularization.
+#
+# This tutorial uses the following packages
+
+using JuMP     # The mathematical programming modelling language
+import DiffOpt # JuMP extension for differentiable optimization
+import Ipopt   # Optimization solver that handles quadratic programs
+import LinearAlgebra
+import Plots
+import Random
+
+# ## Define and solve the SVM
+
+# Construct two clusters of data points.
+
+N = 100
+D = 2
+
+Random.seed!(62)
+X_data = vcat(randn(N ÷ 2, D), randn(N ÷ 2, D) .+ [2.0, 2.0]')
+y = append!(ones(N ÷ 2), -ones(N ÷ 2))
+λ = 0.05;
+
+# Let's initialize a special model that can understand sensitivities
+
+model = DiffOpt.quadratic_diff_model(Ipopt.Optimizer)
+set_silent(model)
+
+# Add the variables
+
+@variable(model, ξ[1:N] >= 0)
+@variable(model, w[1:D])
+@variable(model, b);
+
+# Add the parameters to be differentiated
+
+@variable(model, X[1:N, 1:D] in Parameter.(X_data))
+
+# Add the constraints
+
+@constraint(
+    model,
+    con[i in 1:N],
+    y[i] * (LinearAlgebra.dot(X[i, :], w) + b) >= 1 - ξ[i]
+);
+
+# Define the objective and solve
+
+@objective(model, Min, λ * LinearAlgebra.dot(w, w) + sum(ξ))
+
+optimize!(model)
+
+# We can visualize the separating hyperplane.
+
+loss = objective_value(model)
+
+wv = value.(w)
+
+bv = value(b)
+
+svm_x = [-2.0, 4.0] # arbitrary points
+svm_y = (-bv .- wv[1] * svm_x) / wv[2]
+
+p = Plots.scatter(
+    X_data[:, 1],
+    X_data[:, 2];
+    color = [yi > 0 ? :red : :blue for yi in y],
+    label = "",
+)
+Plots.plot!(
+    p,
+    svm_x,
+    svm_y;
+    label = "loss = $(round(loss, digits=2))",
+    width = 3,
+)
+
+# ## Gradient of hyperplane wrt the data point coordinates
+
+# Now that we've solved the SVM, we can compute the sensitivity of optimal
+# values -- the separating hyperplane in our case -- with respect to
+# perturbations of the problem data -- the data points -- using DiffOpt.
+
+# How does a change in coordinates of the data points, `X`,
+# affects the position of the hyperplane?
+# This is achieved by finding gradients of `w` and `b` with respect to `X[i]`.
+
+# Begin differentiating the model.
+# analogous to varying θ in the expression:
+# ```math
+# y_{i} (w^T (X_{i} + \theta) + b) \ge 1 - \xi_{i}
+# ```
+∇ = zeros(N)
+for i in 1:N
+    for j in 1:N
+        if i == j
+            ## we consider identical perturbations on all x_i coordinates
+            DiffOpt.set_forward_parameter.(model, X[i, :], 1.0)
+        else
+            DiffOpt.set_forward_parameter.(model, X[i, :], 0.0)
+        end
+    end
+    DiffOpt.forward_differentiate!(model)
+    dw = DiffOpt.get_forward_variable.(model, w)
+    db = DiffOpt.get_forward_variable.(model, b)
+    ∇[i] = LinearAlgebra.norm(dw) + LinearAlgebra.norm(db)
+end
+
+# We can visualize the separating hyperplane sensitivity with respect to the data points.
+# Note that all the small numbers were converted into 1/10 of the
+# largest value to show all the points of the set.
+
+p3 = Plots.scatter(
+    X_data[:, 1],
+    X_data[:, 2];
+    color = [yi > 0 ? :red : :blue for yi in y],
+    label = "",
+    markersize = 2 * (max.(1.8∇, 0.2 * maximum(∇))),
+)
+Plots.yaxis!(p3, (-2, 4.5))
+Plots.plot!(p3, svm_x, svm_y; label = "", width = 3)
+Plots.title!("Sensitivity of the separator to data point variations")
diff --git a/src/ConicProgram/ConicProgram.jl b/src/ConicProgram/ConicProgram.jl
index 89a791831..ab9eb20bd 100644
--- a/src/ConicProgram/ConicProgram.jl
+++ b/src/ConicProgram/ConicProgram.jl
@@ -50,6 +50,8 @@ const Form{T} = MOI.Utilities.GenericModel{
         DiffOpt.ProductOfSets{T},
     },
 }
+
+# should the be applied on Model?
 function MOI.supports(
     ::Form{T},
     ::MOI.ObjectiveFunction{F},
@@ -91,7 +93,6 @@ mutable struct Model <: DiffOpt.AbstractModel
     input_cache::DiffOpt.InputCache
 
     x::Vector{Float64} # Primal
-    s::Vector{Float64} # Slack
     y::Vector{Float64} # Dual
     diff_time::Float64
 end
@@ -105,7 +106,6 @@ function Model()
         DiffOpt.InputCache(),
         Float64[],
         Float64[],
-        Float64[],
         NaN,
     )
 end
@@ -121,7 +121,6 @@ function MOI.empty!(model::Model)
     model.back_grad_cache = nothing
     empty!(model.input_cache)
     empty!(model.x)
-    empty!(model.s)
     empty!(model.y)
     model.diff_time = NaN
     return
@@ -141,6 +140,14 @@ function MOI.supports_constraint(
     return MOI.supports_constraint(model.model, F, S)
 end
 
+function MOI.supports_constraint(
+    ::Model,
+    ::Type{MOI.VectorAffineFunction{T}},
+    ::Type{MOI.PositiveSemidefiniteConeSquare},
+) where {T}
+    return false
+end
+
 function MOI.set(
     model::Model,
     ::MOI.ConstraintPrimalStart,
@@ -148,11 +155,7 @@ function MOI.set(
     value,
 )
     MOI.throw_if_not_valid(model, ci)
-    return DiffOpt._enlarge_set(
-        model.s,
-        MOI.Utilities.rows(model.model.constraints, ci),
-        value,
-    )
+    return
 end
 
 function MOI.set(
@@ -189,12 +192,6 @@ function _gradient_cache(model::Model)
         )
     end
 
-    if any(isnan, model.s) || length(model.s) < length(b)
-        error(
-            "Some constraints are missing a value for the `ConstraintPrimalStart` attribute.",
-        )
-    end
-
     if MOI.get(model, MOI.ObjectiveSense()) == MOI.FEASIBILITY_SENSE
         c = SparseArrays.spzeros(size(A, 2))
     else
@@ -216,10 +213,12 @@ function _gradient_cache(model::Model)
     m = A.m
     n = A.n
     N = m + n + 1
+
+    slack = b - A * model.x
     # NOTE: w = 1.0 systematically since we asserted the primal-dual pair is optimal
     # `inv(M)((x, y, 1), (0, s, 0)) = (x, y, 1) - (0, s, 0)`,
     # see Minty parametrization in https://stanford.edu/~boyd/papers/pdf/cone_prog_refine.pdf
-    (u, v, w) = (model.x, model.y - model.s, 1.0)
+    (u, v, w) = (model.x, model.y - slack, 1.0)
 
     # find gradient of projections on dual of the cones
     Dπv = DiffOpt.Dπ(v, model.model, model.model.constraints.sets)
@@ -260,12 +259,12 @@ function DiffOpt.forward_differentiate!(model::Model)
         M = gradient_cache.M
         vp = gradient_cache.vp
         Dπv = gradient_cache.Dπv
-        x = model.x
-        y = model.y
-        s = model.s
         A = gradient_cache.A
         b = gradient_cache.b
         c = gradient_cache.c
+        x = model.x
+        y = model.y
+        slack = b - A * x
 
         objective_function = DiffOpt._convert(
             MOI.ScalarAffineFunction{Float64},
@@ -302,13 +301,14 @@ function DiffOpt.forward_differentiate!(model::Model)
             dAj,
             dAv,
         )
+        dAv .*= -1.0
         dA = SparseArrays.sparse(dAi, dAj, dAv, lines, cols)
 
         m = size(A, 1)
         n = size(A, 2)
         N = m + n + 1
         # NOTE: w = 1 systematically since we asserted the primal-dual pair is optimal
-        (u, v, w) = (x, y - s, 1.0)
+        (u, v, w) = (x, y - slack, 1.0)
 
         # g = dQ * Π(z/|w|) = dQ * [u, vp, 1.0]
         RHS = [
@@ -339,12 +339,12 @@ function DiffOpt.reverse_differentiate!(model::Model)
         M = gradient_cache.M
         vp = gradient_cache.vp
         Dπv = gradient_cache.Dπv
-        x = model.x
-        y = model.y
-        s = model.s
         A = gradient_cache.A
         b = gradient_cache.b
         c = gradient_cache.c
+        x = model.x
+        y = model.y
+        slack = b - A * x
 
         dx = zeros(length(c))
         for (vi, value) in model.input_cache.dx
@@ -357,13 +357,13 @@ function DiffOpt.reverse_differentiate!(model::Model)
         n = size(A, 2)
         N = m + n + 1
         # NOTE: w = 1 systematically since we asserted the primal-dual pair is optimal
-        (u, v, w) = (x, y - s, 1.0)
+        (u, v, w) = (x, y - slack, 1.0)
 
         # dz = D \phi (z)^T (dx,dy,dz)
         dz = [
             dx
             Dπv' * (dy + ds) - ds
-            -x' * dx - y' * dy - s' * ds
+            -x' * dx - y' * dy - slack' * ds
         ]
 
         g = if LinearAlgebra.norm(dz) <= 1e-4 # TODO: parametrize or remove
diff --git a/src/DiffOpt.jl b/src/DiffOpt.jl
index 0dce78d96..27f16d5ea 100644
--- a/src/DiffOpt.jl
+++ b/src/DiffOpt.jl
@@ -48,6 +48,8 @@ function add_default_factorization(model)
     return
 end
 
+include("jump_wrapper.jl")
+
 export diff_optimizer
 
 # TODO
diff --git a/src/NonLinearProgram/NonLinearProgram.jl b/src/NonLinearProgram/NonLinearProgram.jl
index 6b816b4fc..843d576fc 100644
--- a/src/NonLinearProgram/NonLinearProgram.jl
+++ b/src/NonLinearProgram/NonLinearProgram.jl
@@ -113,14 +113,19 @@ function MOI.supports_constraint(
     S<:Union{
         MOI.GreaterThan{Float64},
         MOI.LessThan{Float64},
-        # MOI.Interval{Float64},
         MOI.EqualTo{Float64},
-        MOI.Parameter{Float64},
     },
 }
     return true
 end
 
+function MOI.supports_add_constrained_variable(
+    ::Form,
+    ::Type{MOI.Parameter{T}},
+) where {T}
+    return true
+end
+
 function _add_leq_geq(
     form::Form,
     idx::MOI.ConstraintIndex,
@@ -189,7 +194,7 @@ function MOI.add_constraint(
     form.num_constraints += 1
     p = MOI.Nonlinear.add_parameter(form.model, set.value)
     form.var2param[idx] = p
-    idx_ci = MOI.ConstraintIndex{F,S}(form.num_constraints)
+    idx_ci = MOI.ConstraintIndex{F,S}(idx.value)
     form.var2ci[idx] = idx_ci
     return idx_ci
 end
@@ -294,6 +299,13 @@ function Model()
     )
 end
 
+function MOI.supports_add_constrained_variable(
+    ::Model,
+    ::Type{MOI.Parameter{T}},
+) where {T}
+    return true
+end
+
 _objective_sense(form::Form) = form.sense
 _objective_sense(model::Model) = _objective_sense(model.model)
 
@@ -507,8 +519,8 @@ function DiffOpt.forward_differentiate!(model::Model; tol = 1e-6)
         Δp = zeros(length(cache.params))
         for (i, var_idx) in enumerate(cache.params)
             ky = form.var2ci[var_idx]
-            if haskey(model.input_cache.dp, ky) # only for set sensitivities
-                Δp[i] = model.input_cache.dp[ky]
+            if haskey(model.input_cache.parameter_constraints, ky) # only for set sensitivities
+                Δp[i] = model.input_cache.parameter_constraints[ky]
             end
         end
 
@@ -595,6 +607,7 @@ function MOI.get(
     ci::MOI.ConstraintIndex,
 )
     try
+        # TODO check ci.value's
         idx = model.cache.dual_mapping[ci.value]
         return model.forw_grad_cache.dual_Δs[idx]
     catch
@@ -607,7 +620,10 @@ function MOI.get(
     ::DiffOpt.ReverseConstraintSet,
     ci::MOI.ConstraintIndex{MOI.VariableIndex,MOI.Parameter{T}},
 ) where {T}
-    return MOI.Parameter{T}(model.back_grad_cache.Δp[ci.value])
+    form = model.model
+    var_idx = MOI.VariableIndex(ci.value)
+    p_idx = form.var2param[var_idx].value
+    return MOI.Parameter{T}(model.back_grad_cache.Δp[p_idx])
 end
 
 end # module NonLinearProgram
diff --git a/src/copy_dual.jl b/src/copy_dual.jl
index a7c24d23b..dbeca2a19 100644
--- a/src/copy_dual.jl
+++ b/src/copy_dual.jl
@@ -87,6 +87,9 @@ function _copy_dual(dest::MOI.ModelLike, src::MOI.ModelLike, index_map)
         MOI.get(src, MOI.VariablePrimal(), vis_src),
     )
     for (F, S) in MOI.get(dest, MOI.ListOfConstraintTypesPresent())
+        if F <: MOI.VariableIndex && S <: MOI.Parameter
+            continue
+        end
         _copy_constraint_start(
             dest,
             src,
diff --git a/src/diff_opt.jl b/src/diff_opt.jl
index b850e72aa..3bb4de79d 100644
--- a/src/diff_opt.jl
+++ b/src/diff_opt.jl
@@ -13,7 +13,6 @@ const MOIDD = MOI.Utilities.DoubleDicts
 
 Base.@kwdef mutable struct InputCache
     dx::Dict{MOI.VariableIndex,Float64} = Dict{MOI.VariableIndex,Float64}()# dz for QP
-    dp::Dict{MOI.ConstraintIndex,Float64} = Dict{MOI.ConstraintIndex,Float64}() # Specifically for NonLinearProgram
     dy::Dict{MOI.ConstraintIndex,Float64} = Dict{MOI.ConstraintIndex,Float64}()
     # Dual sensitivity currently only works for NonLinearProgram
     # ds
@@ -23,6 +22,8 @@ Base.@kwdef mutable struct InputCache
     #       concrete value types.
     # `scalar_constraints` and `vector_constraints` includes `A` and `b` for CPs
     # or `G` and `h` for QPs
+    parameter_constraints::Dict{MOI.ConstraintIndex,Float64} =
+        Dict{MOI.ConstraintIndex,Float64}() # Specifically for NonLinearProgram
     scalar_constraints::MOIDD.DoubleDict{MOI.ScalarAffineFunction{Float64}} =
         MOIDD.DoubleDict{MOI.ScalarAffineFunction{Float64}}() # also includes G for QPs
     vector_constraints::MOIDD.DoubleDict{MOI.VectorAffineFunction{Float64}} =
@@ -33,8 +34,8 @@ end
 
 function Base.empty!(cache::InputCache)
     empty!(cache.dx)
-    empty!(cache.dp)
     empty!(cache.dy)
+    empty!(cache.parameter_constraints)
     empty!(cache.scalar_constraints)
     empty!(cache.vector_constraints)
     cache.objective = nothing
@@ -136,6 +137,16 @@ ConstraintFunction so we consider the expression `θ * (x + 2y - 5)`.
 """
 struct ForwardConstraintFunction <: MOI.AbstractConstraintAttribute end
 
+"""
+    ForwardConstraintSet <: MOI.AbstractConstraintAttribute
+
+A `MOI.AbstractConstraintAttribute` to set input data to forward differentiation, that
+is, problem input data.
+
+Currently, this only works for the set `MOI.Parameter`.
+"""
+struct ForwardConstraintSet <: MOI.AbstractConstraintAttribute end
+
 """
     ForwardVariablePrimal <: MOI.AbstractVariableAttribute
 
@@ -167,10 +178,6 @@ MOI.set(model, DiffOpt.ReverseVariablePrimal(), x)
 """
 struct ReverseVariablePrimal <: MOI.AbstractVariableAttribute end
 
-struct ForwardConstraintSet <: MOI.AbstractConstraintAttribute end
-
-struct ReverseConstraintSet <: MOI.AbstractConstraintAttribute end
-
 """
     ReverseConstraintDual <: MOI.AbstractConstraintAttribute
 
@@ -253,6 +260,18 @@ struct ReverseConstraintFunction <: MOI.AbstractConstraintAttribute end
 
 MOI.is_set_by_optimize(::ReverseConstraintFunction) = true
 
+"""
+    ReverseConstraintSet
+
+An `MOI.AbstractConstraintAttribute` to get output data to reverse differentiation, that
+is, problem input data.
+
+Currently, this only works for the set `MOI.Parameter`.
+"""
+struct ReverseConstraintSet <: MOI.AbstractConstraintAttribute end
+
+MOI.is_set_by_optimize(::ReverseConstraintSet) = true
+
 """
     DifferentiateTimeSec()
 
@@ -273,6 +292,11 @@ Model supporting [`forward_differentiate!`](@ref) and
 """
 abstract type AbstractModel <: MOI.ModelLike end
 
+function empty_input_sensitivities!(model::AbstractModel)
+    empty!(model.input_cache)
+    return
+end
+
 MOI.supports_incremental_interface(::AbstractModel) = true
 
 function MOI.is_valid(model::AbstractModel, idx::MOI.Index)
@@ -287,6 +311,8 @@ function MOI.add_variables(model::AbstractModel, n)
     return MOI.add_variables(model.model, n)
 end
 
+# TODO: add support for add_constrained_variable(s) and supports_
+
 function MOI.Utilities.pass_nonvariable_constraints(
     dest::AbstractModel,
     src::MOI.ModelLike,
@@ -409,6 +435,11 @@ function MOI.set(
     ci::MOI.ConstraintIndex{MOI.ScalarAffineFunction{T},S},
     func::MOI.ScalarAffineFunction{T},
 ) where {T,S}
+    if MOI.supports_add_constrained_variable(model.model, MOI.Parameter{T})
+        error(
+            "The model with type $(typeof(model)) does support Parameters, so setting ForwardConstraintFunction fails.",
+        )
+    end
     model.input_cache.scalar_constraints[ci] = func
     return
 end
@@ -419,10 +450,30 @@ function MOI.set(
     ci::MOI.ConstraintIndex{MOI.VectorAffineFunction{T},S},
     func::MOI.VectorAffineFunction{T},
 ) where {T,S}
+    if MOI.supports_add_constrained_variable(model.model, MOI.Parameter{T})
+        error(
+            "The model with type $(typeof(model)) does support Parameters, so setting ForwardConstraintFunction fails.",
+        )
+    end
     model.input_cache.vector_constraints[ci] = func
     return
 end
 
+function MOI.set(
+    model::AbstractModel,
+    ::ForwardConstraintSet,
+    ci::MOI.ConstraintIndex{MOI.VariableIndex,MOI.Parameter{T}},
+    set::MOI.Parameter{T},
+) where {T}
+    if !MOI.supports_add_constrained_variable(model.model, MOI.Parameter{T})
+        error(
+            "The model with type $(typeof(model)) does not support Parameters",
+        )
+    end
+    model.input_cache.parameter_constraints[ci] = set.value
+    return
+end
+
 function lazy_combination(op::F, α, a, β, b) where {F<:Function}
     return LazyArrays.ApplyArray(
         op,
@@ -655,6 +706,26 @@ function _push_term(
     return _push_term(I, J, V, neg, r[term.output_index], term.scalar_term)
 end
 
+function MOI.supports(::AbstractModel, ::MOI.Name)
+    return false
+end
+
+function MOI.supports(
+    ::AbstractModel,
+    ::MOI.VariableName,
+    ::Type{MOI.VariableIndex},
+)
+    return false
+end
+
+function MOI.supports(
+    ::AbstractModel,
+    ::MOI.ConstraintName,
+    ::Type{MOI.ConstraintIndex{F,S}},
+) where {F,S}
+    return false
+end
+
 function MOI.supports(model::AbstractModel, attr::MOI.AbstractModelAttribute)
     return MOI.supports(model.model, attr)
 end
diff --git a/src/jump_wrapper.jl b/src/jump_wrapper.jl
new file mode 100644
index 000000000..ce26c4a5c
--- /dev/null
+++ b/src/jump_wrapper.jl
@@ -0,0 +1,145 @@
+"""
+    diff_model(optimizer_constructor; with_bridge_type = Float64, with_cache_type = Float64, with_outer_cache = true)
+
+Create a JuMP model with a differentiable optimizer. The optimizer is created
+using `optimizer_constructor`. This model will try to select the proper
+differentiable optimization method based on the problem structure.
+
+See also: [`nonlinear_diff_model`](@ref), [`conic_diff_model`](@ref), [`quadratic_diff_model`](@ref).
+"""
+function diff_model(
+    optimizer_constructor;
+    with_bridge_type = Float64,
+    with_cache_type = Float64,
+    with_outer_cache = true,
+)
+    inner = diff_optimizer(
+        optimizer_constructor;
+        with_bridge_type,
+        with_cache_type,
+        with_outer_cache,
+    )
+    return JuMP.direct_model(inner)
+end
+
+"""
+    nonlinear_diff_model(optimizer_constructor; with_bridge_type = Float64, with_cache_type = Float64, with_outer_cache = true)
+
+Create a JuMP model with a differentiable optimizer for nonlinear programs.
+The optimizer is created using `optimizer_constructor`.
+
+See also: [`conic_diff_model`](@ref), [`quadratic_diff_model`](@ref), [`diff_model`](@ref).
+"""
+function nonlinear_diff_model(
+    optimizer_constructor;
+    with_parametric_opt_interface = false,
+    with_bridge_type = Float64,
+    with_cache_type = Float64,
+    with_outer_cache = true,
+)
+    inner = diff_optimizer(
+        optimizer_constructor;
+        with_bridge_type,
+        with_cache_type,
+        with_outer_cache,
+    )
+    MOI.set(inner, ModelConstructor(), NonLinearProgram.Model)
+    return JuMP.direct_model(inner)
+end
+
+"""
+    conic_diff_model(optimizer_constructor; with_bridge_type = Float64, with_cache_type = Float64, with_outer_cache = true)
+
+Create a JuMP model with a differentiable optimizer for conic programs.
+The optimizer is created using `optimizer_constructor`.
+
+See also: [`nonlinear_diff_model`](@ref), [`quadratic_diff_model`](@ref), [`diff_model`](@ref).
+"""
+function conic_diff_model(
+    optimizer_constructor;
+    with_bridge_type = Float64,
+    with_cache_type = Float64,
+    with_outer_cache = true,
+)
+    inner = diff_optimizer(
+        optimizer_constructor;
+        with_bridge_type,
+        with_cache_type,
+        with_outer_cache,
+    )
+    MOI.set(inner, ModelConstructor(), ConicProgram.Model)
+    return JuMP.direct_model(inner)
+end
+
+"""
+    quadratic_diff_model(optimizer_constructor; with_bridge_type = Float64, with_cache_type = Float64, with_outer_cache = true)
+
+Create a JuMP model with a differentiable optimizer for quadratic programs.
+The optimizer is created using `optimizer_constructor`.
+
+See also: [`nonlinear_diff_model`](@ref), [`conic_diff_model`](@ref), [`diff_model`](@ref).
+"""
+function quadratic_diff_model(
+    optimizer_constructor;
+    with_bridge_type = Float64,
+    with_cache_type = Float64,
+    with_outer_cache = true,
+)
+    inner = diff_optimizer(
+        optimizer_constructor;
+        with_bridge_type,
+        with_cache_type,
+        with_outer_cache,
+    )
+    MOI.set(inner, ModelConstructor(), QuadraticProgram.Model)
+    return JuMP.direct_model(inner)
+end
+
+"""
+    set_forward_parameter(model::JuMP.Model, variable::JuMP.VariableRef, value::Number)
+
+Set the value of a parameter input sensitivity for forward mode.
+"""
+function set_forward_parameter(
+    model::JuMP.Model,
+    variable::JuMP.VariableRef,
+    value::Number,
+)
+    return MOI.set(
+        model,
+        ForwardConstraintSet(),
+        ParameterRef(variable),
+        Parameter(value),
+    )
+end
+
+"""
+    get_reverse_parameter(model::JuMP.Model, variable::JuMP.VariableRef)
+
+Get the value of a parameter output sensitivity for reverse mode.
+"""
+function get_reverse_parameter(model::JuMP.Model, variable::JuMP.VariableRef)
+    return MOI.get(model, ReverseConstraintSet(), ParameterRef(variable)).value
+end
+
+"""
+    set_reverse_variable(model::JuMP.Model, variable::JuMP.VariableRef, value::Number)
+
+Set the value of a variable input sensitivity for reverse mode.
+"""
+function set_reverse_variable(
+    model::JuMP.Model,
+    variable::JuMP.VariableRef,
+    value::Number,
+)
+    return MOI.set(model, ReverseVariablePrimal(), variable, value)
+end
+
+"""
+    get_forward_variable(model::JuMP.Model, variable::JuMP.VariableRef)
+
+Get the value of a variable output sensitivity for forward mode.
+"""
+function get_forward_variable(model::JuMP.Model, variable::JuMP.VariableRef)
+    return MOI.get(model, ForwardVariablePrimal(), variable)
+end
diff --git a/src/moi_wrapper.jl b/src/moi_wrapper.jl
index 5c7c7be86..d6cdca517 100644
--- a/src/moi_wrapper.jl
+++ b/src/moi_wrapper.jl
@@ -24,44 +24,60 @@ julia> model.add_constraint(model, ...)
 """
 function diff_optimizer(
     optimizer_constructor;
-    with_parametric_opt_interface::Bool = false,
     with_bridge_type = Float64,
-    with_cache::Bool = true,
+    with_cache_type = Float64,
+    with_outer_cache = true,
+    allow_parametric_opt_interface = true,
 )
-    optimizer = MOI.instantiate(optimizer_constructor; with_bridge_type)
+    optimizer = MOI.instantiate(
+        optimizer_constructor;
+        with_bridge_type,
+        with_cache_type,
+    )
+    add_poi =
+        allow_parametric_opt_interface &&
+        !MOI.supports_add_constrained_variable(
+            optimizer.model,
+            MOI.Parameter{Float64},
+        )
     # When we do `MOI.copy_to(diff, optimizer)` we need to efficiently `MOI.get`
     # the model information from `optimizer`. However, 1) `optimizer` may not
     # implement some getters or it may be inefficient and 2) the getters may be
     # unimplemented or inefficient through some bridges.
     # For this reason we add a cache layer, the same cache JuMP adds.
-    caching_opt = if with_cache
+    caching_opt = if with_outer_cache
         MOI.Utilities.CachingOptimizer(
             MOI.Utilities.UniversalFallback(
                 MOI.Utilities.Model{with_bridge_type}(),
             ),
-            optimizer,
+            add_poi ? POI.Optimizer(optimizer) : optimizer,
         )
     else
-        optimizer
-    end
-    if with_parametric_opt_interface
-        return POI.Optimizer(Optimizer(caching_opt))
-    else
-        return Optimizer(caching_opt)
+        add_poi ? POI.Optimizer(optimizer) : optimizer
     end
+    return Optimizer(caching_opt)
 end
 
 mutable struct Optimizer{OT<:MOI.ModelLike} <: MOI.AbstractOptimizer
+    # main optimizer responsible for caching the optimization problem data
+    # and for solving the optimization problem
     optimizer::OT
 
+    # list of differentiation backends for automatic differentiation
+    # without mode selection
     model_constructors::Vector{Any}
+    # used to select a single differentiation backend
+    # if not `nothing`, it is used to select the model constructor
+    # and the above is ignored
     model_constructor::Any
 
+    # instantiated differentiation backend from the options above
     diff::Any
 
+    # mapping between the `optimizer` and the `diff` models
     index_map::Union{Nothing,MOI.Utilities.IndexMap}
 
-    # sensitivity input cache using MOI like sparse format
+    # sensitivity input cache using MOI-like sparse format
     input_cache::InputCache
 
     function Optimizer(optimizer::OT) where {OT<:MOI.ModelLike}
@@ -560,6 +576,25 @@ function forward_differentiate!(model::Optimizer)
         NonLinearKKTJacobianFactorization(),
         model.input_cache.factorization,
     )
+    T = Float64
+    list = MOI.get(
+        model,
+        MOI.ListOfConstraintIndices{MOI.VariableIndex,MOI.Parameter{T}}(),
+    )
+    parametric_diff = !isempty(list)
+    if parametric_diff # MOI.supports_constraint(model, MOI.VariableIndex, MOI.Parameter{T})
+        # @show "param mode"
+        for (vi, value) in model.input_cache.parameter_constraints
+            MOI.set(
+                diff,
+                ForwardConstraintSet(),
+                model.index_map[vi],
+                MOI.Parameter(value),
+            )
+        end
+        return forward_differentiate!(diff)
+    end
+    # @show "func mode"
     if model.input_cache.objective !== nothing
         MOI.set(
             diff,
@@ -570,7 +605,9 @@ function forward_differentiate!(model::Optimizer)
             ),
         )
     end
-    for (F, S) in keys(model.input_cache.scalar_constraints.dict)
+    for (F, S) in MOI.Utilities.DoubleDicts.nonempty_outer_keys(
+        model.input_cache.scalar_constraints,
+    )
         _copy_forward_in_constraint(
             diff,
             model.index_map,
@@ -578,7 +615,9 @@ function forward_differentiate!(model::Optimizer)
             model.input_cache.scalar_constraints[F, S],
         )
     end
-    for (F, S) in keys(model.input_cache.vector_constraints.dict)
+    for (F, S) in MOI.Utilities.DoubleDicts.nonempty_outer_keys(
+        model.input_cache.vector_constraints,
+    )
         _copy_forward_in_constraint(
             diff,
             model.index_map,
@@ -586,24 +625,19 @@ function forward_differentiate!(model::Optimizer)
             model.input_cache.vector_constraints[F, S],
         )
     end
-    if model.input_cache.dp !== nothing
-        for (vi, value) in model.input_cache.dp
-            diff.model.input_cache.dp[model.index_map[vi]] = value
-        end
-    end
     return forward_differentiate!(diff)
 end
 
 function empty_input_sensitivities!(model::Optimizer)
     empty!(model.input_cache)
     if model.diff !== nothing
-        empty!(model.diff.model.input_cache)
+        empty_input_sensitivities!(model.diff)
     end
     return
 end
 
-function _instantiate_with_bridges(model_constructor)
-    model = MOI.Bridges.LazyBridgeOptimizer(MOI.instantiate(model_constructor))
+function _add_bridges(instantiated_model)
+    model = MOI.Bridges.LazyBridgeOptimizer(instantiated_model)
     # We don't add any variable bridge here because:
     # 1) If `ZerosBridge` is used, `MOI.Bridges.unbridged_function` does not work.
     #    This is in fact expected: since `ZerosBridge` drops the variable, we dont
@@ -616,14 +650,37 @@ function _instantiate_with_bridges(model_constructor)
     return model
 end
 
+function _instantiate_diff(model::Optimizer, constructor)
+    # parametric_diff = MOI.supports_constraint(
+    #     model,
+    #     MOI.VariableIndex,
+    #     MOI.Parameter{Float64},
+    # )
+    list = MOI.get(
+        model,
+        MOI.ListOfConstraintIndices{MOI.VariableIndex,MOI.Parameter{Float64}}(),
+    )
+    parametric_diff = !isempty(list)
+    model_instance = MOI.instantiate(constructor)
+    needs_poi =
+        !MOI.supports_add_constrained_variable(
+            model_instance,
+            MOI.Parameter{Float64},
+        )
+    model_bridged = _add_bridges(model_instance)
+    if needs_poi && parametric_diff
+        return POI.Optimizer(model_bridged)
+    end
+    return model_bridged
+end
+
 function _diff(model::Optimizer)
     if model.diff === nothing
         _check_termination_status(model)
         model_constructor = MOI.get(model, ModelConstructor())
         if isnothing(model_constructor)
-            model.diff = nothing
             for constructor in model.model_constructors
-                model.diff = _instantiate_with_bridges(constructor)
+                model.diff = _instantiate_diff(model, constructor)
                 try
                     model.index_map = MOI.copy_to(model.diff, model.optimizer)
                 catch err
@@ -648,7 +705,7 @@ function _diff(model::Optimizer)
                 )
             end
         else
-            model.diff = _instantiate_with_bridges(model_constructor)
+            model.diff = _instantiate_diff(model, model_constructor)
             model.index_map = MOI.copy_to(model.diff, model.optimizer)
         end
         _copy_dual(model.diff, model.optimizer, model.index_map)
@@ -688,6 +745,18 @@ end
 MOI.supports(::Optimizer, ::ForwardObjectiveFunction) = true
 
 function MOI.set(model::Optimizer, ::ForwardObjectiveFunction, objective)
+    T = Float64
+    list = MOI.get(
+        model,
+        MOI.ListOfConstraintIndices{MOI.VariableIndex,MOI.Parameter{T}}(),
+    )
+    parametric_diff = !isempty(list)
+    if parametric_diff
+        error(
+            "Cannot set forward objective function for a model with parameters. " *
+            "Use `MOI.set(model, ForwardConstraintSet(), ParameterRef(vi), Parameter(val))` instead.",
+        )
+    end
     model.input_cache.objective = objective
     return
 end
@@ -746,6 +815,14 @@ function MOI.set(
     vi::MOI.VariableIndex,
     val,
 )
+    T = Float64
+    is_param = MOI.is_valid(
+        model,
+        MOI.ConstraintIndex{MOI.VariableIndex,MOI.Parameter{T}}(vi.value),
+    )
+    if is_param
+        error("Trying to set a backward variable sensitivity for a parameter")
+    end
     model.input_cache.dx[vi] = val
     return
 end
@@ -814,6 +891,17 @@ function MOI.set(
     ci::MOI.ConstraintIndex{MOI.ScalarAffineFunction{T},S},
     func::MOI.ScalarAffineFunction{T},
 ) where {T,S}
+    list = MOI.get(
+        model,
+        MOI.ListOfConstraintIndices{MOI.VariableIndex,MOI.Parameter{T}}(),
+    )
+    parametric_diff = !isempty(list)
+    if parametric_diff
+        error(
+            "Cannot set forward constraint function for a model with parameters. " *
+            "Use `MOI.set(model, ForwardConstraintSet(), ParameterRef(vi), Parameter(val))` instead.",
+        )
+    end
     model.input_cache.scalar_constraints[ci] = func
     return
 end
@@ -824,6 +912,17 @@ function MOI.set(
     ci::MOI.ConstraintIndex{MOI.VectorAffineFunction{T},S},
     func::MOI.VectorAffineFunction{T},
 ) where {T,S}
+    list = MOI.get(
+        model,
+        MOI.ListOfConstraintIndices{MOI.VariableIndex,MOI.Parameter{T}}(),
+    )
+    parametric_diff = !isempty(list)
+    if parametric_diff
+        error(
+            "Cannot set forward constraint function for a model with parameters. " *
+            "Use `MOI.set(model, ForwardConstraintSet(), ParameterRef(vi), Parameter(val))` instead.",
+        )
+    end
     model.input_cache.vector_constraints[ci] = func
     return
 end
@@ -858,12 +957,39 @@ function MOI.get(
     end
 end
 
+function MOI.supports(
+    ::Optimizer,
+    ::ForwardConstraintSet,
+    ::Type{MOI.ConstraintIndex{MOI.VariableIndex,MOI.Parameter{T}}},
+) where {T}
+    return true
+end
+
+function MOI.get(
+    model::Optimizer,
+    ::ForwardConstraintSet,
+    ci::MOI.ConstraintIndex{MOI.VariableIndex,MOI.Parameter{T}},
+) where {T}
+    set = get(model.input_cache.parameter_constraints, ci, zero(T))
+    return MOI.Parameter{T}(set)
+end
+
+function MOI.set(
+    model::Optimizer,
+    ::ForwardConstraintSet,
+    ci::MOI.ConstraintIndex{MOI.VariableIndex,MOI.Parameter{T}},
+    set::MOI.Parameter,
+) where {T}
+    model.input_cache.parameter_constraints[ci] = set.value
+    return
+end
+
 function MOI.get(model::Optimizer, attr::DifferentiateTimeSec)
     return MOI.get(model.diff, attr)
 end
 
 function MOI.supports(
-    model::Optimizer,
+    ::Optimizer,
     ::NonLinearKKTJacobianFactorization,
     ::Function,
 )
diff --git a/src/parameters.jl b/src/parameters.jl
index 68ff54a49..b0a026904 100644
--- a/src/parameters.jl
+++ b/src/parameters.jl
@@ -10,9 +10,7 @@ MOI.supports(::POI.Optimizer, ::ForwardObjectiveFunction) = false
 function MOI.set(::POI.Optimizer, ::ForwardObjectiveFunction, _)
     return error(
         "Forward objective function is not supported when " *
-        "`with_parametric_opt_interface` is set to `true` in " *
-        "`diff_optimizer`." *
-        "Use parameters to set the forward sensitivity.",
+        "`JuMP.Parameter`s (or `MOI.Parameter`s) are present in the model.",
     )
 end
 
@@ -26,9 +24,7 @@ function MOI.set(
 )
     return error(
         "Forward constraint function is not supported when " *
-        "`with_parametric_opt_interface` is set to `true` in " *
-        "`diff_optimizer`." *
-        "Use parameters to set the forward sensitivity.",
+        "`JuMP.Parameter`s (or `MOI.Parameter`s) are present in the model.",
     )
 end
 
@@ -37,9 +33,7 @@ MOI.supports(::POI.Optimizer, ::ReverseObjectiveFunction) = false
 function MOI.get(::POI.Optimizer, ::ReverseObjectiveFunction)
     return error(
         "Reverse objective function is not supported when " *
-        "`with_parametric_opt_interface` is set to `true` in " *
-        "`diff_optimizer`." *
-        "Use parameters to get the reverse sensitivity.",
+        "`JuMP.Parameter`s (or `MOI.Parameter`s) are present in the model.",
     )
 end
 
@@ -52,9 +46,7 @@ function MOI.get(
 )
     return error(
         "Reverse constraint function is not supported when " *
-        "`with_parametric_opt_interface` is set to `true` in " *
-        "`diff_optimizer`." *
-        "Use parameters to get the reverse sensitivity.",
+        "`JuMP.Parameter`s (or `MOI.Parameter`s) are present in the model.",
     )
 end
 
@@ -315,16 +307,6 @@ function MOI.set(
     return
 end
 
-function MOI.set(
-    model::Optimizer,
-    ::ForwardConstraintSet,
-    ci::MOI.ConstraintIndex{MOI.VariableIndex,MOI.Parameter{T}},
-    set::MOI.Parameter,
-) where {T}
-    model.input_cache.dp[ci] = set.value
-    return
-end
-
 function MOI.get(
     model::POI.Optimizer,
     attr::ForwardVariablePrimal,
@@ -567,8 +549,6 @@ function MOI.set(
     return
 end
 
-MOI.is_set_by_optimize(::ReverseConstraintSet) = true
-
 function MOI.get(
     model::POI.Optimizer,
     ::ReverseConstraintSet,
diff --git a/test/Project.toml b/test/Project.toml
index e7c5a5b8d..b7775fc19 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -2,6 +2,7 @@
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
 DiffOpt = "930fe3bc-9c6b-11ea-2d94-6184641e85e7"
+FiniteDiff = "6a86dc24-6348-571c-b903-95158fe2bd41"
 Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
 HiGHS = "87dc4568-4c63-4d18-b0c0-bb2238e4078b"
 Ipopt = "b6b21f68-93f8-5de0-b562-5493be1d77c9"
@@ -10,13 +11,14 @@ JuMP = "4076af6c-e467-56ae-b986-b466b2749572"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 MLDatasets = "eb30cadb-4394-5ae3-aed4-317e484a6458"
 MathOptInterface = "b8f27783-ece8-5eb3-8dc8-9495eed66fee"
+ParametricOptInterface = "0ce4ce61-57bf-432b-a095-efac525d185e"
+Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 SCS = "c946c3f1-0d1f-5ce8-9dea-7daa1f7e2d13"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
-FiniteDiff = "6a86dc24-6348-571c-b903-95158fe2bd41"
 
 [compat]
 HiGHS = "1"
diff --git a/test/conic_program.jl b/test/conic_program.jl
index a3d4a9398..2b4a20aa5 100644
--- a/test/conic_program.jl
+++ b/test/conic_program.jl
@@ -11,6 +11,7 @@ import Ipopt
 import LinearAlgebra
 import MathOptInterface as MOI
 import SCS
+using JuMP
 
 const ATOL = 2e-4
 const RTOL = 2e-4
@@ -28,7 +29,7 @@ end
 
 function _test_simple_socp(eq_vec::Bool)
     # referred from _soc2test, https://github.com/jump-dev/MathOptInterface.jl/blob/master/src/Test/contconic.jl#L1355
-    # find equivalent diffcp python program here: https://github.com/AKS1996/jump-gsoc-2020/blob/master/diffcp_socp_1_py.ipynb
+    # find reference diffcp python program here: https://github.com/AKS1996/jump-gsoc-2020/blob/master/diffcp_socp_1_py.ipynb
     # Problem SOC2
     # min  x
     # s.t. y ≥ 1/√2
@@ -38,80 +39,103 @@ function _test_simple_socp(eq_vec::Bool)
     # s.t.  -1/√2 + y ∈ R₊
     #        1 - t ∈ {0}
     #      (t,x,y) ∈ SOC₃
-    model = DiffOpt.diff_optimizer(SCS.Optimizer)
-    MOI.set(model, MOI.Silent(), true)
-    x, y, t = MOI.add_variables(model, 3)
-    MOI.set(
-        model,
-        MOI.ObjectiveFunction{MOI.ScalarAffineFunction{Float64}}(),
-        1.0x,
-    )
-    MOI.set(model, MOI.ObjectiveSense(), MOI.MIN_SENSE)
+
+    model = JuMP.Model(() -> DiffOpt.diff_optimizer(SCS.Optimizer))
+    set_silent(model)
+
+    x = @variable(model)
+    y = @variable(model)
+    t = @variable(model)
+
+    ceq = if eq_vec
+        @constraint(model, [t] .== [1.0])
+    else
+        @constraint(model, t == 1.0)
+    end
+    cnon = @constraint(model, 1.0y >= 1 / √2)
+    csoc = @constraint(model, [1.0t, 1.0x, 1.0y] in MOI.SecondOrderCone(3))
+
+    @objective(model, Min, 1.0x)
+
+    optimize!(model)
+
+    # set foward sensitivities
     if eq_vec
-        ceq = MOI.add_constraint(
-            model,
-            MOI.Utilities.vectorize([-1.0t + 1.0]),
-            MOI.Zeros(1),
-        )
+        MOI.set.(model, DiffOpt.ForwardConstraintFunction(), ceq, [1.0 * x])
     else
-        ceq = MOI.add_constraint(model, -1.0t, MOI.EqualTo(-1.0))
+        MOI.set(model, DiffOpt.ForwardConstraintFunction(), ceq, 1.0 * x)
     end
-    cnon = MOI.add_constraint(
-        model,
-        MOI.Utilities.vectorize([1.0y - 1 / √2]),
-        MOI.Nonnegatives(1),
-    )
-    csoc = MOI.add_constraint(
-        model,
-        MOI.Utilities.vectorize([1.0t, 1.0x, 1.0y]),
-        MOI.SecondOrderCone(3),
-    )
-    MOI.optimize!(model)
+
+    DiffOpt.forward_differentiate!(model)
+
+    dx = -0.9999908
+    @test MOI.get(model, DiffOpt.ForwardVariablePrimal(), x) ≈ dx atol = ATOL rtol =
+        RTOL
+
+    MOI.set(model, DiffOpt.ReverseVariablePrimal(), x, 1.0)
+
+    DiffOpt.reverse_differentiate!(model)
+
     if eq_vec
-        MOI.set(
-            model,
-            DiffOpt.ForwardConstraintFunction(),
-            ceq,
-            MOI.Utilities.vectorize([1.0 * x]),
+        @test all(
+            isapprox.(
+                JuMP.coefficient.(
+                    MOI.get.(model, DiffOpt.ReverseConstraintFunction(), ceq),
+                    x,
+                ),
+                dx,
+                atol = ATOL,
+                rtol = RTOL,
+            ),
         )
     else
-        MOI.set(model, DiffOpt.ForwardConstraintFunction(), ceq, 1.0 * x)
+        @test JuMP.coefficient(
+            MOI.get(model, DiffOpt.ReverseConstraintFunction(), ceq),
+            x,
+        ) ≈ dx atol = ATOL rtol = RTOL
     end
-    MOI.set(
-        model,
-        DiffOpt.ForwardConstraintFunction(),
-        cnon,
-        MOI.Utilities.vectorize([1.0 * y]),
-    )
+
+    DiffOpt.empty_input_sensitivities!(model)
+
+    MOI.set(model, DiffOpt.ForwardConstraintFunction(), cnon, 1.0 * y)
+
+    DiffOpt.forward_differentiate!(model)
+
+    dy = -0.707083
+    @test MOI.get(model, DiffOpt.ForwardVariablePrimal(), y) ≈ dy atol = ATOL rtol =
+        RTOL
+
+    MOI.set(model, DiffOpt.ReverseVariablePrimal(), y, 1.0)
+
+    DiffOpt.reverse_differentiate!(model)
+
+    @test JuMP.coefficient(
+        MOI.get(model, DiffOpt.ReverseConstraintFunction(), cnon),
+        y,
+    ) ≈ dy atol = ATOL rtol = RTOL
+
+    DiffOpt.empty_input_sensitivities!(model)
+
     MOI.set(
         model,
         DiffOpt.ForwardConstraintFunction(),
         csoc,
-        MOI.Utilities.operate(vcat, Float64, 1.0 * t, 0.0, 0.0),
+        MOI.Utilities.operate(vcat, Float64, 1.0 * t.index, 0.0, 0.0),
     )
+
     DiffOpt.forward_differentiate!(model)
-    # these matrices are benchmarked with the output generated by diffcp
-    # refer the python file mentioned above to get equivalent python source code
-    @test model.diff.model.x ≈ [-1 / √2; 1 / √2; 1.0] atol = ATOL rtol = RTOL
-    if eq_vec
-        @test model.diff.model.s ≈ [0.0, 0.0, 1.0, -1 / √2, 1 / √2] atol = ATOL rtol =
-            RTOL
-        @test model.diff.model.y ≈ [√2, 1.0, √2, 1.0, -1.0] atol = ATOL rtol =
-            RTOL
-    else
-        @test model.diff.model.s ≈ [0.0, 1.0, -1 / √2, 1 / √2, 0.0] atol = ATOL rtol =
-            RTOL
-        @test model.diff.model.y ≈ [1.0, √2, 1.0, -1.0, √2] atol = ATOL rtol =
-            RTOL
-    end
-    dx = [1.12132144; 1 / √2; 1 / √2]
-    for (i, vi) in enumerate([x, y, t])
-        @test dx[i] ≈ MOI.get(model, DiffOpt.ForwardVariablePrimal(), vi) atol =
-            ATOL rtol = RTOL
-    end
-    # @test dx ≈ [1.12132144; 1/√2; 1/√2] atol=ATOL rtol=RTOL
-    # @test ds ≈ [0.0; 0.0; -2.92893438e-01;  1.12132144e+00; 7.07106999e-01]  atol=ATOL rtol=RTOL
-    # @test dy ≈ [2.4142175; 5.00000557; 3.8284315; √2; -4.00000495] atol=ATOL rtol=RTOL
+
+    ds = 0.0
+    @test MOI.get(model, DiffOpt.ForwardVariablePrimal(), t) ≈ ds atol = ATOL rtol =
+        RTOL
+
+    MOI.set(model, DiffOpt.ReverseVariablePrimal(), t, 1.0)
+
+    DiffOpt.reverse_differentiate!(model)
+
+    # FIXME: this is not working - https://github.com/jump-dev/DiffOpt.jl/issues/283
+    # @test JuMP.coefficient(MOI.get(model, DiffOpt.ReverseConstraintFunction(), csoc).func.func.func, t.index) ≈ ds atol=ATOL rtol=RTOL
+
     return
 end
 
@@ -375,206 +399,82 @@ function test_differentiating_conic_with_PSD_and_SOC_constraints()
     return
 end
 
-function test_differentiating_conic_with_PSD_and_POS_constraints()
+function _build_simple_sdp()
     # refer psdt2test, https://github.com/jump-dev/MathOptInterface.jl/blob/master/src/Test/contconic.jl#L4306
     # find equivalent diffcp program here - https://github.com/AKS1996/jump-gsoc-2020/blob/master/diffcp_sdp_3_py.ipynb
-    model = DiffOpt.diff_optimizer(SCS.Optimizer)
-    MOI.set(model, MOI.Silent(), true)
-    x = MOI.add_variables(model, 7)
-    @test MOI.get(model, MOI.NumberOfVariables()) == 7
-    η = 10.0
-    c1 = MOI.add_constraint(
-        model,
-        MOI.VectorAffineFunction(
-            MOI.VectorAffineTerm.(1, MOI.ScalarAffineTerm.(-1.0, x[1:6])),
-            [η],
-        ),
-        MOI.Nonnegatives(1),
-    )
-    c2 = MOI.add_constraint(
-        model,
-        MOI.VectorAffineFunction(
-            MOI.VectorAffineTerm.(1:6, MOI.ScalarAffineTerm.(1.0, x[1:6])),
-            zeros(6),
-        ),
-        MOI.Nonnegatives(6),
-    )
-    α = 0.8
-    δ = 0.9
-    c3 = MOI.add_constraint(
-        model,
-        MOI.VectorAffineFunction(
-            MOI.VectorAffineTerm.(
-                [fill(1, 7); fill(2, 5); fill(3, 6)],
-                MOI.ScalarAffineTerm.(
-                    [
-                        δ / 2,
-                        α,
-                        δ,
-                        δ / 4,
-                        δ / 8,
-                        0.0,
-                        -1.0,
-                        -δ / (2 * √2),
-                        -δ / 4,
-                        0,
-                        -δ / (8 * √2),
-                        0.0,
-                        δ / 2,
-                        δ - α,
-                        0,
-                        δ / 8,
-                        δ / 4,
-                        -1.0,
-                    ],
-                    [x[1:7]; x[1:3]; x[5:6]; x[1:3]; x[5:7]],
-                ),
-            ),
-            zeros(3),
-        ),
-        MOI.PositiveSemidefiniteConeTriangle(2),
-    )
-    c4 = MOI.add_constraint(
-        model,
-        MOI.VectorAffineFunction(
-            MOI.VectorAffineTerm.(
-                1,
-                MOI.ScalarAffineTerm.(0.0, [x[1:3]; x[5:6]]),
-            ),
-            [0.0],
-        ),
-        MOI.Zeros(1),
-    )
-    MOI.set(
-        model,
-        MOI.ObjectiveFunction{MOI.ScalarAffineFunction{Float64}}(),
-        MOI.ScalarAffineFunction([MOI.ScalarAffineTerm(1.0, x[7])], 0.0),
-    )
-    MOI.set(model, MOI.ObjectiveSense(), MOI.MAX_SENSE)
-    MOI.optimize!(model)
-    # dc = ones(7)
-    MOI.set(
-        model,
-        DiffOpt.ForwardObjectiveFunction(),
-        MOI.ScalarAffineFunction(MOI.ScalarAffineTerm.(ones(7), x), 0.0),
-    )
-    # db = ones(11)
-    # dA = ones(11, 7)
-    MOI.set(
-        model,
-        DiffOpt.ForwardConstraintFunction(),
-        c1,
-        MOI.Utilities.vectorize(ones(1, 7) * x + ones(1)),
-    )
-    MOI.set(
-        model,
-        DiffOpt.ForwardConstraintFunction(),
-        c2,
-        MOI.Utilities.vectorize(ones(6, 7) * x + ones(6)),
-    )
-    MOI.set(
+    # Make a JuMP model backed by DiffOpt.diff_optimizer(SCS.Optimizer)
+    model = Model(() -> DiffOpt.diff_optimizer(SCS.Optimizer))
+    set_silent(model)  # just to suppress solver output
+
+    @variable(model, x[1:3])
+
+    @constraint(model, c1, sum(x[i] for i in 1:3) == 4)
+
+    @constraint(model, c2[i = 1:3], x[i] ≥ 0)
+
+    @constraint(model, x[1] == 2)
+
+    @constraint(
         model,
-        DiffOpt.ForwardConstraintFunction(),
         c3,
-        MOI.Utilities.vectorize(ones(3, 7) * x + ones(3)),
-    )
-    MOI.set(
-        model,
-        DiffOpt.ForwardConstraintFunction(),
-        c4,
-        MOI.Utilities.vectorize(ones(1, 7) * x + ones(1)),
+        LinearAlgebra.Symmetric([
+            x[3]+1 2
+            2 2x[3]+2
+        ]) in PSDCone()
     )
-    DiffOpt.forward_differentiate!(model)
-    @test model.diff.model.x ≈
-          [20 / 3.0, 0.0, 10 / 3.0, 0.0, 0.0, 0.0, 1.90192379] atol = ATOL rtol =
-        RTOL
-    @test model.diff.model.s ≈ [
-        0.0,
-        0.0,
-        20 / 3.0,
-        0.0,
-        10 / 3.0,
-        0.0,
-        0.0,
-        0.0,
-        4.09807621,
-        -2.12132,
-        1.09807621,
-    ] atol = ATOL rtol = RTOL
-    @test model.diff.model.y ≈ [
-        0.0,
-        0.19019238,
-        0.0,
-        0.12597667,
-        0.0,
-        0.14264428,
-        0.14264428,
-        0.01274047,
-        0.21132487,
-        0.408248,
-        0.78867513,
-    ] atol = ATOL rtol = RTOL
-    atol = 0.3
-    rtol = 0.01
-    # compare these with https://github.com/AKS1996/jump-gsoc-2020/blob/master/diffcp_sdp_3_py.ipynb
-    # results are not exactly as: 1. there is some residual error   2. diffcp results are SCS specific, hence scaled
-    dx = [-39.6066, 10.8953, -14.9189, 10.9054, 10.883, 10.9118, -21.7508]
-    for (i, vi) in enumerate(x)
-        @test dx[i] ≈ MOI.get(model, DiffOpt.ForwardVariablePrimal(), vi) atol =
-            atol rtol = rtol
+
+    @objective(model, Min, 4x[3] + x[2])
+    return model
+end
+
+function test_differentiating_conic_with_PSD_constraints()
+    model = _build_simple_sdp()
+    optimize!(model)
+    x = model[:x]
+    c1 = model[:c1]
+    c2 = model[:c2]
+    sx = value.(x)
+    @test sx ≈ [2.0, 3.0 - sqrt(2), sqrt(2) - 1] atol = ATOL rtol = RTOL
+
+    for i in 1:3
+        _model = _build_simple_sdp()
+        JuMP.set_normalized_coefficient(_model[:c1], _model[:x][i], 1.001)
+        optimize!(_model)
+        _dx = (value(_model[:x][i]) - value(sx[i])) / 0.001
+        i in (1, 3) ? (@test abs(_dx) < 0.05) : (@test -1.6 < _dx < -1.45)
+        MOI.set(model, DiffOpt.ForwardConstraintFunction(), c1, x[i] + 0.0)
+        DiffOpt.forward_differentiate!(model)
+        _dx = MOI.get(model, DiffOpt.ForwardVariablePrimal(), x[i])
+        i in (1, 3) ? (@test abs(_dx) < 0.05) : (@test -1.6 < _dx < -1.45)
+        MOI.set(model, DiffOpt.ReverseVariablePrimal(), x[i], 1.0)
+        DiffOpt.reverse_differentiate!(model)
+        _dx = JuMP.coefficient(
+            MOI.get(model, DiffOpt.ReverseConstraintFunction(), c1),
+            x[i],
+        )
+        i in (1, 3) ? (@test abs(_dx) < 0.05) : (@test -1.6 < _dx < -1.45)
+        DiffOpt.empty_input_sensitivities!(model)
     end
-    # @test dy ≈ [0.0, -3.56905, 0.0, -0.380035, 0.0, -0.41398, -0.385321, -0.00743119, -0.644986, -0.550542, -2.36765] atol=atol rtol=rtol
-    # @test ds ≈ [0.0, 0.0, -50.4973, 0.0, -25.8066, 0.0, 0.0, 0.0, -7.96528, -1.62968, -2.18925] atol=atol rtol=rtol
-    # TODO: future example, how to differentiate wrt a specific constraint/variable, refer QPLib article for more
-    dA = zeros(11, 7)
-    dA[3:8, 1:6] = Matrix{Float64}(LinearAlgebra.I, 6, 6)  # differentiating only wrt POS constraint c2
-    db = zeros(11)
-    dc = zeros(7)
-    # db = zeros(11)
-    # dA = zeros(11, 7)
-    # dA[3:8, 1:6] = Matrix{Float64}(LinearAlgebra.I, 6, 6)  # differentiating only wrt POS constraint c2
-    MOI.set(
-        model,
-        DiffOpt.ForwardConstraintFunction(),
-        c1,
-        MOI.Utilities.zero_with_output_dimension(
-            MOI.VectorAffineFunction{Float64},
-            1,
-        ),
-    )
-    MOI.set(
-        model,
-        DiffOpt.ForwardConstraintFunction(),
-        c2,
-        MOI.Utilities.vectorize(ones(6) .* x[1:6]),
-    )
-    MOI.set(
-        model,
-        DiffOpt.ForwardConstraintFunction(),
-        c3,
-        MOI.Utilities.zero_with_output_dimension(
-            MOI.VectorAffineFunction{Float64},
-            3,
-        ),
-    )
-    MOI.set(
-        model,
-        DiffOpt.ForwardConstraintFunction(),
-        c4,
-        MOI.Utilities.zero_with_output_dimension(
-            MOI.VectorAffineFunction{Float64},
-            1,
-        ),
-    )
-    DiffOpt.forward_differentiate!(model)
-    # for (i, vi) in enumerate(X)
-    #     @test 0.0 ≈ MOI.get(model,
-    #         DiffOpt.ForwardVariablePrimal(), vi) atol=1e-2 rtol=RTOL
-    # end
-    # TODO add a test here, probably on duals
-    # # note that there's no change in the PSD slack values or dual optimas
-    # @test dy ≈ [0.0, 0.0, 0.0, 0.125978, 0.0, 0.142644, 0.142641, 0.0127401, 0.0, 0.0, 0.0] atol=atol rtol=RTOL
-    # @test ds ≈ [0.0, 0.0, -6.66672, 0.0, -3.33336, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] atol=atol rtol=RTOL
+    for i in 1:3
+        DiffOpt.empty_input_sensitivities!(model)
+        _model = _build_simple_sdp()
+        JuMP.set_normalized_coefficient(_model[:c2][i], _model[:x][i], 1.001)
+        optimize!(_model)
+        _dx = (value(_model[:x][i]) - value(sx[i])) / 0.001
+        @test abs(_dx) < 0.15
+        MOI.set(model, DiffOpt.ForwardConstraintFunction(), c2[i], x[i] + 0.0)
+        DiffOpt.forward_differentiate!(model)
+        _dx = MOI.get(model, DiffOpt.ForwardVariablePrimal(), x[i])
+        @test abs(_dx) < 0.15
+        MOI.set(model, DiffOpt.ReverseVariablePrimal(), x[i], 1.0)
+        DiffOpt.reverse_differentiate!(model)
+        _dx = JuMP.coefficient(
+            MOI.get(model, DiffOpt.ReverseConstraintFunction(), c2[i]),
+            x[i],
+        )
+        @test abs(_dx) < 0.15
+    end
+
     return
 end
 
@@ -616,7 +516,6 @@ function test_differentiating_a_simple_psd()
     # dc = zeros(1)
     DiffOpt.forward_differentiate!(model)
     @test model.diff.model.x ≈ [1.0] atol = 10ATOL rtol = 10RTOL
-    @test model.diff.model.s ≈ ones(6) atol = ATOL rtol = RTOL
     @test model.diff.model.y ≈ [1 / 3, -1 / 6, 1 / 3, -1 / 6, -1 / 6, 1 / 3] atol =
         ATOL rtol = RTOL
     @test -0.5 ≈ MOI.get(model, DiffOpt.ForwardVariablePrimal(), x) atol = 1e-2 rtol =
@@ -763,7 +662,6 @@ function test_verifying_cache_on_a_psd()
     dc = zeros(1)
     DiffOpt.forward_differentiate!(model)
     @test model.diff.model.x ≈ [1.0] atol = 10ATOL rtol = 10RTOL
-    @test model.diff.model.s ≈ ones(6) atol = ATOL rtol = RTOL
     @test model.diff.model.y ≈ [1 / 3, -1 / 6, 1 / 3, -1 / 6, -1 / 6, 1 / 3] atol =
         ATOL rtol = RTOL
     @test -0.5 ≈ MOI.get(model, DiffOpt.ForwardVariablePrimal(), x) atol = 1e-2 rtol =
@@ -924,6 +822,25 @@ function test_quad_to_soc()
     return
 end
 
+function test_jump_psd_cone_with_parameter_pv_v_pv()
+    model = DiffOpt.conic_diff_model(SCS.Optimizer)
+    @variable(model, x)
+    @variable(model, p in MOI.Parameter(1.0))
+    @constraint(
+        model,
+        con,
+        [p * x, (2 * x - 3), p * 3 * x] in
+        MOI.PositiveSemidefiniteConeTriangle(2)
+    )
+    @objective(model, Min, x)
+    optimize!(model)
+    direction_p = 2.0
+    DiffOpt.set_forward_parameter(model, p, direction_p)
+    DiffOpt.forward_differentiate!(model)
+    dx = MOI.get(model, DiffOpt.ForwardVariablePrimal(), x)
+    @test dx ≈ 0.0 atol = 1e-4 rtol = 1e-4
+end
+
 end  # module
 
 TestConicProgram.runtests()
diff --git a/test/data/nlp_problems.jl b/test/data/nlp_problems.jl
index 923b48c4c..2769ae82c 100644
--- a/test/data/nlp_problems.jl
+++ b/test/data/nlp_problems.jl
@@ -9,6 +9,7 @@ https://github.com/jump-dev/JuMP.jl/blob/301d46e81cb66c74c6e22cd89fb89ced740f157
 ################################################
 function create_nonlinear_jump_model(; ismin = true)
     model = Model(() -> DiffOpt.diff_optimizer(Ipopt.Optimizer))
+    MOI.set(model, DiffOpt.ModelConstructor(), DiffOpt.NonLinearProgram.Model)
     set_silent(model)
     @variable(model, p ∈ MOI.Parameter(1.0))
     @variable(model, p2 ∈ MOI.Parameter(2.0))
@@ -33,6 +34,7 @@ From sIpopt paper: https://optimization-online.org/2011/04/3008/
 
 function create_nonlinear_jump_model_sipopt(; ismin = true)
     model = Model(() -> DiffOpt.diff_optimizer(Ipopt.Optimizer))
+    MOI.set(model, DiffOpt.ModelConstructor(), DiffOpt.NonLinearProgram.Model)
     set_silent(model)
     @variable(model, p1 ∈ MOI.Parameter(4.5))
     @variable(model, p2 ∈ MOI.Parameter(1.0))
@@ -55,6 +57,7 @@ Simple Problems
 
 function create_jump_model_1(p_val = [1.5])
     model = Model(() -> DiffOpt.diff_optimizer(Ipopt.Optimizer))
+    MOI.set(model, DiffOpt.ModelConstructor(), DiffOpt.NonLinearProgram.Model)
     set_silent(model)
 
     # Parameters
@@ -73,6 +76,7 @@ end
 
 function create_jump_model_2(p_val = [1.5])
     model = Model(() -> DiffOpt.diff_optimizer(Ipopt.Optimizer))
+    MOI.set(model, DiffOpt.ModelConstructor(), DiffOpt.NonLinearProgram.Model)
     set_silent(model)
 
     # Parameters
@@ -90,6 +94,7 @@ end
 
 function create_jump_model_3(p_val = [-1.5])
     model = Model(() -> DiffOpt.diff_optimizer(Ipopt.Optimizer))
+    MOI.set(model, DiffOpt.ModelConstructor(), DiffOpt.NonLinearProgram.Model)
     set_silent(model)
 
     # Parameters
@@ -108,6 +113,7 @@ end
 
 function create_jump_model_4(p_val = [1.5])
     model = Model(() -> DiffOpt.diff_optimizer(Ipopt.Optimizer))
+    MOI.set(model, DiffOpt.ModelConstructor(), DiffOpt.NonLinearProgram.Model)
     set_silent(model)
 
     # Parameters
@@ -126,6 +132,7 @@ end
 
 function create_jump_model_5(p_val = [1.5])
     model = Model(() -> DiffOpt.diff_optimizer(Ipopt.Optimizer))
+    MOI.set(model, DiffOpt.ModelConstructor(), DiffOpt.NonLinearProgram.Model)
     set_silent(model)
 
     # Parameters
@@ -147,6 +154,7 @@ h(y) = -sum(y .* log.(y))
 softmax(x) = exp.(x) / sum(exp.(x))
 function create_jump_model_6(p_a = collect(1.0:0.1:2.0))
     model = Model(() -> DiffOpt.diff_optimizer(Ipopt.Optimizer))
+    MOI.set(model, DiffOpt.ModelConstructor(), DiffOpt.NonLinearProgram.Model)
     set_silent(model)
 
     # Parameters
@@ -167,6 +175,7 @@ end
 
 function create_jump_model_7(p_val = [1.5], g = sin)
     model = Model(() -> DiffOpt.diff_optimizer(Ipopt.Optimizer))
+    MOI.set(model, DiffOpt.ModelConstructor(), DiffOpt.NonLinearProgram.Model)
     set_silent(model)
 
     # Parameters
@@ -190,6 +199,7 @@ Non Linear Problems
 
 function create_nonlinear_jump_model_1(p_val = [1.0; 2.0; 100]; ismin = true)
     model = Model(() -> DiffOpt.diff_optimizer(Ipopt.Optimizer))
+    MOI.set(model, DiffOpt.ModelConstructor(), DiffOpt.NonLinearProgram.Model)
     set_silent(model)
 
     # Parameters
@@ -214,6 +224,7 @@ end
 
 function create_nonlinear_jump_model_2(p_val = [3.0; 2.0; 10]; ismin = true)
     model = Model(() -> DiffOpt.diff_optimizer(Ipopt.Optimizer))
+    MOI.set(model, DiffOpt.ModelConstructor(), DiffOpt.NonLinearProgram.Model)
     set_silent(model)
 
     # Parameters
@@ -238,6 +249,7 @@ end
 
 function create_nonlinear_jump_model_3(p_val = [3.0; 2.0; 10]; ismin = true)
     model = Model(() -> DiffOpt.diff_optimizer(Ipopt.Optimizer))
+    MOI.set(model, DiffOpt.ModelConstructor(), DiffOpt.NonLinearProgram.Model)
     set_silent(model)
 
     # Parameters
@@ -261,6 +273,7 @@ end
 
 function create_nonlinear_jump_model_4(p_val = [1.0; 2.0; 100]; ismin = true)
     model = Model(() -> DiffOpt.diff_optimizer(Ipopt.Optimizer))
+    MOI.set(model, DiffOpt.ModelConstructor(), DiffOpt.NonLinearProgram.Model)
     set_silent(model)
 
     # Parameters
@@ -286,6 +299,7 @@ end
 
 function create_nonlinear_jump_model_5(p_val = [1.0; 2.0; 100]; ismin = true)
     model = Model(() -> DiffOpt.diff_optimizer(Ipopt.Optimizer))
+    MOI.set(model, DiffOpt.ModelConstructor(), DiffOpt.NonLinearProgram.Model)
     set_silent(model)
 
     # Parameters
@@ -312,6 +326,7 @@ end
 
 function create_nonlinear_jump_model_6(p_val = [100.0; 200.0]; ismin = true)
     model = Model(() -> DiffOpt.diff_optimizer(Ipopt.Optimizer))
+    MOI.set(model, DiffOpt.ModelConstructor(), DiffOpt.NonLinearProgram.Model)
     set_silent(model)
 
     # Parameters
diff --git a/test/jump.jl b/test/jump.jl
index bdb031b73..248ceb6ef 100644
--- a/test/jump.jl
+++ b/test/jump.jl
@@ -680,6 +680,62 @@ function test_sensitivity_index_issue()
     return
 end
 
+function test_conic_supports()
+    model = DiffOpt.conic_diff_model(SCS.Optimizer)
+    @test MOI.supports(backend(model), DiffOpt.ForwardObjectiveFunction())
+    @test MOI.supports(
+        backend(model),
+        DiffOpt.ForwardConstraintSet(),
+        MOI.ConstraintIndex{MOI.VariableIndex,MOI.Parameter{Float64}},
+    )
+    function some end
+    @test MOI.supports(
+        backend(model),
+        DiffOpt.NonLinearKKTJacobianFactorization(),
+        some,
+    )
+    MOI.is_set_by_optimize(DiffOpt.ReverseConstraintFunction())
+    MOI.is_set_by_optimize(DiffOpt.ReverseConstraintSet())
+    model = DiffOpt.ConicProgram.Model()
+    @test !MOI.supports(model, MOI.Name())
+    @test MOI.supports_incremental_interface(model)
+    return
+end
+
+function test_conic_feasibility()
+    model = DiffOpt.conic_diff_model(SCS.Optimizer)
+    set_silent(model)
+
+    @variable(model, x)
+    @variable(model, p in Parameter(1.0))
+
+    @constraint(model, con, [0.0, x, p * x] in SecondOrderCone())
+
+    set_objective_sense(model, MOI.FEASIBILITY_SENSE)
+
+    optimize!(model)
+
+    DiffOpt.set_forward_parameter(model, p, 1.0)
+    DiffOpt.forward_differentiate!(model)
+    return
+end
+
+function test_psd_square_error()
+    model = DiffOpt.conic_diff_model(SCS.Optimizer)
+    set_silent(model)
+
+    @variable(model, x)
+    @variable(model, p in Parameter(1.0))
+
+    @constraint(model, con, [-p*x 0; 0 x] in PSDCone())
+
+    @test_throws MOI.SetAttributeNotAllowed optimize!(model)
+
+    # DiffOpt.set_forward_parameter(model, p, 1.0)
+    # DiffOpt.forward_differentiate!(model)
+    return
+end
+
 end  # module
 
 TestJuMP.runtests()
diff --git a/test/jump_wrapper.jl b/test/jump_wrapper.jl
new file mode 100644
index 000000000..68d7be386
--- /dev/null
+++ b/test/jump_wrapper.jl
@@ -0,0 +1,128 @@
+# Copyright (c) 2020: Akshay Sharma and contributors
+#
+# Use of this source code is governed by an MIT-style license that can be found
+# in the LICENSE.md file or at https://opensource.org/licenses/MIT.
+
+module TestJuMPWrapper
+
+using Test
+using JuMP
+import DiffOpt
+import HiGHS
+import Ipopt
+import SCS
+import ParametricOptInterface as POI
+import MathOptInterface as MOI
+import ParametricOptInterface as POI
+
+const ATOL = 1e-3
+const RTOL = 1e-3
+
+function runtests()
+    for name in names(@__MODULE__; all = true)
+        if startswith("$name", "test_")
+            @testset "$(name)" begin
+                getfield(@__MODULE__, name)()
+            end
+        end
+    end
+    return
+end
+
+function test_jump_api()
+    for (MODEL, SOLVER) in [
+            (DiffOpt.diff_model, HiGHS.Optimizer),
+            (DiffOpt.diff_model, SCS.Optimizer),
+            (DiffOpt.diff_model, Ipopt.Optimizer),
+            (DiffOpt.quadratic_diff_model, HiGHS.Optimizer),
+            (DiffOpt.quadratic_diff_model, SCS.Optimizer),
+            (DiffOpt.quadratic_diff_model, Ipopt.Optimizer),
+            (DiffOpt.conic_diff_model, HiGHS.Optimizer),
+            (DiffOpt.conic_diff_model, SCS.Optimizer),
+            (DiffOpt.conic_diff_model, Ipopt.Optimizer),
+            (DiffOpt.nonlinear_diff_model, HiGHS.Optimizer),
+            (DiffOpt.nonlinear_diff_model, SCS.Optimizer),
+            (DiffOpt.nonlinear_diff_model, Ipopt.Optimizer),
+        ],
+        ineq in [true, false],
+        _min in [true, false],
+        flip in [true, false]
+
+        @testset "$(MODEL) with: $(SOLVER), $(ineq ? "ineqs" : "eqs"), $(_min ? "Min" : "Max"), $(flip ? "geq" : "leq")" begin
+            model = MODEL(SOLVER)
+            set_silent(model)
+
+            p_val = 4.0
+            pc_val = 2.0
+            @variable(model, x)
+            @variable(model, p in Parameter(p_val))
+            @variable(model, pc in Parameter(pc_val))
+            if ineq
+                if !flip
+                    cons = @constraint(model, con, pc * x >= 3 * p)
+                else
+                    cons = @constraint(model, con, pc * x <= 3 * p)
+                end
+            else
+                cons = @constraint(model, con, pc * x == 3 * p)
+            end
+            sign = flip ? -1 : 1
+            if _min
+                @objective(model, Min, 2x * sign)
+            else
+                @objective(model, Max, -2x * sign)
+            end
+            optimize!(model)
+            @test value(x) ≈ 3 * p_val / pc_val atol = ATOL rtol = RTOL
+
+            # the function is
+            # x(p, pc) = 3p / pc
+            # hence,
+            # dx/dp = 3 / pc
+            # dx/dpc = -3p / pc^2
+
+            # First, try forward mode AD
+
+            # differentiate w.r.t. p
+            direction_p = 3.0
+            DiffOpt.set_forward_parameter(model, p, direction_p)
+            DiffOpt.forward_differentiate!(model)
+            @test DiffOpt.get_forward_variable(model, x) ≈
+                  direction_p * 3 / pc_val atol = ATOL rtol = RTOL
+
+            # update p and pc
+            p_val = 2.0
+            pc_val = 6.0
+            set_parameter_value(p, p_val)
+            set_parameter_value(pc, pc_val)
+            # re-optimize
+            optimize!(model)
+            # check solution
+            @test value(x) ≈ 3 * p_val / pc_val atol = ATOL rtol = RTOL
+
+            # stop differentiating with respect to p
+            DiffOpt.empty_input_sensitivities!(model)
+            # differentiate w.r.t. pc
+            direction_pc = 10.0
+            DiffOpt.set_forward_parameter(model, pc, direction_pc)
+            DiffOpt.forward_differentiate!(model)
+            @test DiffOpt.get_forward_variable(model, x) ≈
+                  -direction_pc * 3 * p_val / pc_val^2 atol = ATOL rtol = RTOL
+
+            # always a good practice to clear previously set sensitivities
+            DiffOpt.empty_input_sensitivities!(model)
+            # Now, reverse model AD
+            direction_x = 10.0
+            DiffOpt.set_reverse_variable(model, x, direction_x)
+            DiffOpt.reverse_differentiate!(model)
+            @test DiffOpt.get_reverse_parameter(model, p) ≈
+                  direction_x * 3 / pc_val atol = ATOL rtol = RTOL
+            @test DiffOpt.get_reverse_parameter(model, pc) ≈
+                  -direction_x * 3 * p_val / pc_val^2 atol = ATOL rtol = RTOL
+        end
+    end
+end
+
+end # module
+
+TestJuMPWrapper.runtests()
diff --git a/test/moi_wrapper.jl b/test/moi_wrapper.jl
index 6c60c2658..c9daf8513 100644
--- a/test/moi_wrapper.jl
+++ b/test/moi_wrapper.jl
@@ -28,10 +28,10 @@ end
 function test_moi_test_runtests()
     model = DiffOpt.diff_optimizer(HiGHS.Optimizer)
     # `Variable.ZerosBridge` makes dual needed by some tests fail.
-    MOI.Bridges.remove_bridge(
-        model.optimizer.optimizer,
-        MOI.Bridges.Variable.ZerosBridge{Float64},
-    )
+    # MOI.Bridges.remove_bridge(
+    #     model.optimizer.optimizer.optimizer,
+    #     MOI.Bridges.Variable.ZerosBridge{Float64},
+    # )
     MOI.set(model, MOI.Silent(), true)
     config =
         MOI.Test.Config(; atol = 1e-7, exclude = Any[MOI.compute_conflict!])
diff --git a/test/nlp_program.jl b/test/nlp_program.jl
index 862050035..bc54fd163 100644
--- a/test/nlp_program.jl
+++ b/test/nlp_program.jl
@@ -120,12 +120,8 @@ end
 
 function test_analytical_simple(; P = 2) # Number of parameters
     @testset "Bounds Bounds" begin
-        m = Model(
-            () -> DiffOpt.diff_optimizer(
-                Ipopt.Optimizer;
-                with_parametric_opt_interface = false,
-            ),
-        )
+        m = Model(() -> DiffOpt.diff_optimizer(Ipopt.Optimizer))
+        MOI.set(m, DiffOpt.ModelConstructor(), DiffOpt.NonLinearProgram.Model)
 
         @variable(m, 0 ≤ x[1:P] ≤ 1)
         @variable(m, p[1:P] ∈ Parameter.(0.5))
@@ -194,12 +190,8 @@ function test_analytical_simple(; P = 2) # Number of parameters
         )
     end
     @testset "Bounds as RHS constraints" begin
-        m = Model(
-            () -> DiffOpt.diff_optimizer(
-                Ipopt.Optimizer;
-                with_parametric_opt_interface = false,
-            ),
-        )
+        m = Model(() -> DiffOpt.diff_optimizer(Ipopt.Optimizer))
+        MOI.set(m, DiffOpt.ModelConstructor(), DiffOpt.NonLinearProgram.Model)
 
         @variable(m, x[1:P])
         @constraint(m, x .≥ 0)
@@ -238,12 +230,8 @@ function test_analytical_simple(; P = 2) # Number of parameters
         )
     end
     @testset "Bounds as Mixed constraints" begin
-        m = Model(
-            () -> DiffOpt.diff_optimizer(
-                Ipopt.Optimizer;
-                with_parametric_opt_interface = false,
-            ),
-        )
+        m = Model(() -> DiffOpt.diff_optimizer(Ipopt.Optimizer))
+        MOI.set(m, DiffOpt.ModelConstructor(), DiffOpt.NonLinearProgram.Model)
 
         @variable(m, x[1:P])
         @constraint(m, 0 .≤ x)
@@ -282,12 +270,8 @@ function test_analytical_simple(; P = 2) # Number of parameters
         )
     end
     @testset "Bounds as LHS constraints" begin
-        m = Model(
-            () -> DiffOpt.diff_optimizer(
-                Ipopt.Optimizer;
-                with_parametric_opt_interface = false,
-            ),
-        )
+        m = Model(() -> DiffOpt.diff_optimizer(Ipopt.Optimizer))
+        MOI.set(m, DiffOpt.ModelConstructor(), DiffOpt.NonLinearProgram.Model)
 
         @variable(m, x[1:P])
         @constraint(m, 0 .≤ x)
@@ -664,6 +648,7 @@ function test_differentiating_non_trivial_convex_qp_jump()
     h = vec(h)
     b = vec(b)
     model = JuMP.Model(() -> DiffOpt.diff_optimizer(Ipopt.Optimizer))
+    MOI.set(model, DiffOpt.ModelConstructor(), DiffOpt.NonLinearProgram.Model)
     MOI.set(model, MOI.Silent(), true)
     @variable(model, x[1:nz])
     @variable(model, p_le[1:nineq_le] ∈ MOI.Parameter.(0.0))
@@ -709,12 +694,8 @@ function test_differentiating_non_trivial_convex_qp_jump()
 end
 
 function test_ReverseConstraintDual()
-    m = Model(
-        () -> DiffOpt.diff_optimizer(
-            Ipopt.Optimizer;
-            with_parametric_opt_interface = false,
-        ),
-    )
+    m = Model(() -> DiffOpt.diff_optimizer(Ipopt.Optimizer))
+    MOI.set(m, DiffOpt.ModelConstructor(), DiffOpt.NonLinearProgram.Model)
 
     @variable(m, x[1:2])
     @variable(m, p[1:2] ∈ Parameter.(0.5))
@@ -796,12 +777,8 @@ end
 
 function test_changing_factorization()
     P = 2
-    m = Model(
-        () -> DiffOpt.diff_optimizer(
-            Ipopt.Optimizer;
-            with_parametric_opt_interface = false,
-        ),
-    )
+    m = Model(() -> DiffOpt.diff_optimizer(Ipopt.Optimizer))
+    MOI.set(m, DiffOpt.ModelConstructor(), DiffOpt.NonLinearProgram.Model)
 
     @variable(m, x[1:P])
     @constraint(m, x .≥ 0)
diff --git a/test/parameters.jl b/test/parameters.jl
index b38f84c07..9b4f1968d 100644
--- a/test/parameters.jl
+++ b/test/parameters.jl
@@ -30,12 +30,7 @@ function runtests()
 end
 
 function test_diff_rhs()
-    model = Model(
-        () -> DiffOpt.diff_optimizer(
-            HiGHS.Optimizer;
-            with_parametric_opt_interface = true,
-        ),
-    )
+    model = Model(() -> DiffOpt.diff_optimizer(HiGHS.Optimizer))
     set_silent(model)
     @variable(model, x)
     @variable(model, p in Parameter(3.0))
@@ -101,12 +96,7 @@ function test_diff_rhs()
 end
 
 function test_diff_vector_rhs()
-    model = direct_model(
-        DiffOpt.diff_optimizer(
-            SCS.Optimizer;
-            with_parametric_opt_interface = true,
-        ),
-    )
+    model = direct_model(DiffOpt.diff_optimizer(SCS.Optimizer))
     set_silent(model)
     @variable(model, x)
     @variable(model, p in Parameter(3.0))
@@ -152,12 +142,7 @@ function test_diff_vector_rhs()
 end
 
 function test_affine_changes()
-    model = Model(
-        () -> DiffOpt.diff_optimizer(
-            HiGHS.Optimizer;
-            with_parametric_opt_interface = true,
-        ),
-    )
+    model = Model(() -> DiffOpt.diff_optimizer(HiGHS.Optimizer))
     set_silent(model)
     p_val = 3.0
     pc_val = 1.0
@@ -257,12 +242,7 @@ function test_affine_changes()
 end
 
 function test_affine_changes_compact()
-    model = Model(
-        () -> DiffOpt.diff_optimizer(
-            HiGHS.Optimizer;
-            with_parametric_opt_interface = true,
-        ),
-    )
+    model = Model(() -> DiffOpt.diff_optimizer(HiGHS.Optimizer))
     set_silent(model)
     p_val = 3.0
     pc_val = 1.0
@@ -315,12 +295,7 @@ function test_affine_changes_compact()
 end
 
 function test_quadratic_rhs_changes()
-    model = Model(
-        () -> DiffOpt.diff_optimizer(
-            HiGHS.Optimizer;
-            with_parametric_opt_interface = true,
-        ),
-    )
+    model = Model(() -> DiffOpt.diff_optimizer(HiGHS.Optimizer))
     set_silent(model)
     p_val = 2.0
     q_val = 2.0
@@ -444,12 +419,7 @@ function test_quadratic_rhs_changes()
 end
 
 function test_affine_changes_compact_max()
-    model = Model(
-        () -> DiffOpt.diff_optimizer(
-            HiGHS.Optimizer;
-            with_parametric_opt_interface = true,
-        ),
-    )
+    model = Model(() -> DiffOpt.diff_optimizer(HiGHS.Optimizer))
     set_silent(model)
     p_val = 3.0
     pc_val = 1.0
@@ -488,12 +458,7 @@ function test_affine_changes_compact_max()
 end
 
 function test_diff_affine_objective()
-    model = Model(
-        () -> DiffOpt.diff_optimizer(
-            HiGHS.Optimizer;
-            with_parametric_opt_interface = true,
-        ),
-    )
+    model = Model(() -> DiffOpt.diff_optimizer(HiGHS.Optimizer))
     set_silent(model)
     p_val = 3.0
     @variable(model, x)
@@ -528,12 +493,7 @@ function test_diff_affine_objective()
 end
 
 function test_diff_quadratic_objective()
-    model = Model(
-        () -> DiffOpt.diff_optimizer(
-            HiGHS.Optimizer;
-            with_parametric_opt_interface = true,
-        ),
-    )
+    model = Model(() -> DiffOpt.diff_optimizer(HiGHS.Optimizer))
     set_silent(model)
     p_val = 3.0
     @variable(model, x)
@@ -568,12 +528,7 @@ function test_diff_quadratic_objective()
 end
 
 function test_quadratic_objective_qp()
-    model = Model(
-        () -> DiffOpt.diff_optimizer(
-            HiGHS.Optimizer;
-            with_parametric_opt_interface = true,
-        ),
-    )
+    model = Model(() -> DiffOpt.diff_optimizer(HiGHS.Optimizer))
     set_silent(model)
     p_val = 3.0
     @variable(model, x)
@@ -610,12 +565,7 @@ function test_quadratic_objective_qp()
 end
 
 function test_diff_errors_POI()
-    model = Model(
-        () -> DiffOpt.diff_optimizer(
-            HiGHS.Optimizer;
-            with_parametric_opt_interface = true,
-        ),
-    )
+    model = direct_model(DiffOpt.diff_optimizer(HiGHS.Optimizer))
     set_silent(model)
     @variable(model, x)
     @variable(model, p in Parameter(3.0))
@@ -672,12 +622,7 @@ function test_diff_errors_POI()
 end
 
 function test_diff_errors()
-    model = Model(
-        () -> DiffOpt.diff_optimizer(
-            HiGHS.Optimizer;
-            with_parametric_opt_interface = false,
-        ),
-    )
+    model = Model(() -> DiffOpt.diff_optimizer(HiGHS.Optimizer))
     set_silent(model)
     @variable(model, x)
     @variable(model, p in Parameter(3.0))
@@ -730,12 +675,7 @@ end
 
 # Credit to @klamike
 function test_empty_cache()
-    m = Model(
-        () -> DiffOpt.diff_optimizer(
-            HiGHS.Optimizer;
-            with_parametric_opt_interface = true,
-        ),
-    )
+    m = Model(() -> DiffOpt.diff_optimizer(HiGHS.Optimizer))
     @variable(m, x)
     @variable(m, p ∈ Parameter(1.0))
     @variable(m, q ∈ Parameter(2.0))
@@ -747,10 +687,10 @@ function test_empty_cache()
 
     function get_sensitivity(m, xᵢ, pᵢ)
         DiffOpt.empty_input_sensitivities!(m)
-        @test is_empty(unsafe_backend(m).optimizer.input_cache)
-        if !isnothing(unsafe_backend(m).optimizer.diff) &&
-           !isnothing(unsafe_backend(m).optimizer.diff.model.input_cache)
-            @test is_empty(unsafe_backend(m).optimizer.diff.model.input_cache)
+        @test is_empty(unsafe_backend(m).input_cache)
+        if !isnothing(unsafe_backend(m).diff) &&
+           !isnothing(unsafe_backend(m).diff.optimizer.model.input_cache)
+            @test is_empty(unsafe_backend(m).diff.optimizer.model.input_cache)
         end
         MOI.set(
             m,