diff --git a/perf/neural.jl b/perf/neural.jl index a9c9f28..bb34097 100644 --- a/perf/neural.jl +++ b/perf/neural.jl @@ -1,13 +1,40 @@ -# Needs https://github.com/jump-dev/JuMP.jl/pull/3451 +# Neural network optimization using ArrayDiff + NLopt +# +# This demonstrates end-to-end optimization of a simple two-layer neural +# network with array-valued decision variables, array-aware AD, and a +# first-order NLP solver. + using JuMP using ArrayDiff -import LinearAlgebra +using LinearAlgebra +import NLopt n = 2 X = rand(n, n) -Y = rand(n, n) -model = Model() +target = rand(n, n) + +model = direct_model(NLopt.Optimizer()) +set_attribute(model, "algorithm", :LD_LBFGS) + @variable(model, W1[1:n, 1:n], container = ArrayDiff.ArrayOfVariables) @variable(model, W2[1:n, 1:n], container = ArrayDiff.ArrayOfVariables) -Y_hat = W2 * tanh.(W1 * X) -loss = LinearAlgebra.norm(Y_hat .- Y) + +# Set non-zero starting values to avoid saddle point at zero +for i in 1:n, j in 1:n + set_start_value(W1[i, j], 0.1 * randn()) + set_start_value(W2[i, j], 0.1 * randn()) +end + +# Forward pass: Y = W2 * tanh.(W1 * X) +Y = W2 * tanh.(W1 * X) + +# Loss: ||Y - target|| (norm returns a scalar NonlinearExpr) +loss = norm(Y .- target) +@objective(model, Min, loss) + +optimize!(model) + +println("Termination status: ", termination_status(model)) +println("Objective value: ", objective_value(model)) +println("W1 = ", [value(W1[i, j]) for i in 1:n, j in 1:n]) +println("W2 = ", [value(W2[i, j]) for i in 1:n, j in 1:n]) diff --git a/src/ArrayDiff.jl b/src/ArrayDiff.jl index 041c2c9..197600b 100644 --- a/src/ArrayDiff.jl +++ b/src/ArrayDiff.jl @@ -12,6 +12,11 @@ const Nonlinear = MOI.Nonlinear import SparseArrays import OrderedCollections +""" + Mode() <: MOI.Nonlinear.AbstractAutomaticDifferentiation + +Fork of `MOI.Nonlinear.SparseReverseMode` to add array support. +""" struct Mode <: MOI.Nonlinear.AbstractAutomaticDifferentiation end # Override basic math functions to return NaN instead of throwing errors. 
@@ -48,12 +53,35 @@ include("model.jl") include("parse.jl") include("evaluator.jl") -""" - Mode() <: AbstractAutomaticDifferentiation +include("array_nonlinear_function.jl") +include("parse_moi.jl") -Fork of `MOI.Nonlinear.SparseReverseMode` to add array support. -""" +# Tell MOI to create an ArrayDiff.Model when Mode() is the AD backend. +Nonlinear.nonlinear_model(::Mode) = Model() + +# Extend MOI.Nonlinear functions so solvers can call them on ArrayDiff.Model. +function Nonlinear.register_operator( + model::Model, + op::Symbol, + nargs::Int, + f::Function..., +) + return register_operator(model, op, nargs, f...) +end +# Extend MOI.Nonlinear.set_objective so that solvers calling +# MOI.Nonlinear.set_objective(arraydiff_model, snf) dispatch here. +function Nonlinear.set_objective(model::Model, obj::MOI.ScalarNonlinearFunction) + model.objective = parse_expression(model, obj) + return +end + +function Nonlinear.set_objective(model::Model, ::Nothing) + model.objective = nothing + return +end + +# Create an ArrayDiff Evaluator from an ArrayDiff Model. function Evaluator( model::ArrayDiff.Model, ::Mode, @@ -62,6 +90,17 @@ function Evaluator( return Evaluator(model, NLPEvaluator(model, ordered_variables)) end +# Called by solvers via MOI.Nonlinear.Evaluator(nlp_model, ad_backend, vars). +# When nlp_model is an ArrayDiff.Model (created by nonlinear_model(::Mode)), +# the model already has the parsed objective — just build the evaluator. 
+function Nonlinear.Evaluator( + model::Model, + ::Mode, + ordered_variables::Vector{MOI.VariableIndex}, +) + return Evaluator(model, NLPEvaluator(model, ordered_variables)) +end + include("JuMP/JuMP.jl") end # module diff --git a/src/JuMP/JuMP.jl b/src/JuMP/JuMP.jl index c75a800..9ed23d4 100644 --- a/src/JuMP/JuMP.jl +++ b/src/JuMP/JuMP.jl @@ -10,3 +10,4 @@ include("variables.jl") include("nlp_expr.jl") include("operators.jl") include("print.jl") +include("moi_bridge.jl") diff --git a/src/JuMP/moi_bridge.jl b/src/JuMP/moi_bridge.jl new file mode 100644 index 0000000..0734bb4 --- /dev/null +++ b/src/JuMP/moi_bridge.jl @@ -0,0 +1,54 @@ +# Conversion from JuMP array types to MOI ArrayNonlinearFunction +# and set_objective_function that sets AutomaticDifferentiationBackend. + +# ── moi_function: JuMP → MOI ───────────────────────────────────────────────── + +function _to_moi_arg(x::ArrayOfVariables{T,N}) where {T,N} + return ArrayOfVariableIndices{N}(x.offset, x.size) +end + +function _to_moi_arg(x::GenericArrayExpr{V,N}) where {V,N} + args = Any[_to_moi_arg(a) for a in x.args] + return ArrayNonlinearFunction{N}(x.head, args, x.size, x.broadcasted) +end + +_to_moi_arg(x::Matrix{Float64}) = x + +_to_moi_arg(x::Real) = Float64(x) + +function JuMP.moi_function(x::GenericArrayExpr{V,N}) where {V,N} + return _to_moi_arg(x) +end + +# ── Detect whether a JuMP expression contains array args ───────────────────── + +_has_array_args(::Any) = false +_has_array_args(::AbstractJuMPArray) = true + +function _has_array_args(x::JuMP.GenericNonlinearExpr) + return any(_has_array_args, x.args) +end + +# ── set_objective_function for nonlinear expressions with array args ───────── +# When the expression contains array subexpressions, we set +# AutomaticDifferentiationBackend to ArrayDiff.Mode() so the solver +# creates an ArrayDiff.Model (via nonlinear_model) for parsing. 
+ +function JuMP.set_objective_function( + model::JuMP.GenericModel{T}, + func::JuMP.GenericNonlinearExpr{JuMP.GenericVariableRef{T}}, +) where {T<:Real} + if _has_array_args(func) + MOI.set( + JuMP.backend(model), + MOI.AutomaticDifferentiationBackend(), + Mode(), + ) + end + # Standard JuMP flow: convert to MOI and set on backend + f = JuMP.moi_function(func) + attr = MOI.ObjectiveFunction{typeof(f)}() + MOI.set(JuMP.backend(model), attr, f) + model.is_model_dirty = true + return +end diff --git a/src/JuMP/operators.jl b/src/JuMP/operators.jl index 47b5cb3..d82bb72 100644 --- a/src/JuMP/operators.jl +++ b/src/JuMP/operators.jl @@ -62,3 +62,49 @@ end function LinearAlgebra.norm(x::ArrayOfVariables) return _array_norm(x) end + +# Subtraction between array expressions and constant arrays +function Base.:(-)(x::AbstractJuMPArray{T,N}, y::AbstractArray{S,N}) where {S,T,N} + V = JuMP.variable_ref_type(x) + @assert size(x) == size(y) + return GenericArrayExpr{V,N}(:-, Any[x, y], size(x), false) +end + +function Base.:(-)(x::AbstractArray{S,N}, y::AbstractJuMPArray{T,N}) where {S,T,N} + V = JuMP.variable_ref_type(y) + @assert size(x) == size(y) + return GenericArrayExpr{V,N}(:-, Any[x, y], size(y), false) +end + +function Base.:(-)( + x::AbstractJuMPArray{T,N}, + y::AbstractJuMPArray{S,N}, +) where {T,S,N} + V = JuMP.variable_ref_type(x) + @assert JuMP.variable_ref_type(y) == V + @assert size(x) == size(y) + return GenericArrayExpr{V,N}(:-, Any[x, y], size(x), false) +end + +# Addition between array expressions and constant arrays +function Base.:(+)(x::AbstractJuMPArray{T,N}, y::AbstractArray{S,N}) where {S,T,N} + V = JuMP.variable_ref_type(x) + @assert size(x) == size(y) + return GenericArrayExpr{V,N}(:+, Any[x, y], size(x), false) +end + +function Base.:(+)(x::AbstractArray{S,N}, y::AbstractJuMPArray{T,N}) where {S,T,N} + V = JuMP.variable_ref_type(y) + @assert size(x) == size(y) + return GenericArrayExpr{V,N}(:+, Any[x, y], size(y), false) +end + +function Base.:(+)( 
+ x::AbstractJuMPArray{T,N}, + y::AbstractJuMPArray{S,N}, +) where {T,S,N} + V = JuMP.variable_ref_type(x) + @assert JuMP.variable_ref_type(y) == V + @assert size(x) == size(y) + return GenericArrayExpr{V,N}(:+, Any[x, y], size(x), false) +end diff --git a/src/array_nonlinear_function.jl b/src/array_nonlinear_function.jl new file mode 100644 index 0000000..1224d49 --- /dev/null +++ b/src/array_nonlinear_function.jl @@ -0,0 +1,94 @@ +""" + ArrayNonlinearFunction{N} <: MOI.AbstractVectorFunction + +Represents an N-dimensional array-valued nonlinear function for MOI. + +The `output_dimension` is `prod(size)` — the vectorization of the array — since +`MOI.AbstractVectorFunction` cannot represent multidimensional arrays. No actual +vectorization is performed; this is only for passing through MOI layers. + +## Fields + + - `head::Symbol`: the operator (e.g., `:*`, `:tanh`) + - `args::Vector{Any}`: arguments, which may be `ArrayNonlinearFunction`, + `MOI.ScalarNonlinearFunction`, `MOI.VariableIndex`, `Float64`, + `Vector{Float64}`, `Matrix{Float64}`, or `ArrayOfVariableIndices` + - `size::NTuple{N,Int}`: the dimensions of the output array + - `broadcasted::Bool`: whether this is a broadcasted operation +""" +struct ArrayNonlinearFunction{N} <: MOI.AbstractVectorFunction + head::Symbol + args::Vector{Any} + size::NTuple{N,Int} + broadcasted::Bool +end + +function MOI.output_dimension(f::ArrayNonlinearFunction) + return prod(f.size) +end + +""" + ArrayOfVariableIndices{N} + +A block of contiguous `MOI.VariableIndex` values representing an N-dimensional +array. Used as an argument in `ArrayNonlinearFunction`. 
+""" +struct ArrayOfVariableIndices{N} <: MOI.AbstractVectorFunction + offset::Int + size::NTuple{N,Int} +end + +Base.size(a::ArrayOfVariableIndices) = a.size + +function MOI.output_dimension(f::ArrayOfVariableIndices) + return prod(f.size) +end + +function Base.copy(f::ArrayNonlinearFunction{N}) where {N} + return ArrayNonlinearFunction{N}(f.head, copy(f.args), f.size, f.broadcasted) +end + +function Base.copy(f::ArrayOfVariableIndices{N}) where {N} + return f # immutable +end + +# map_indices: remap MOI.VariableIndex values during MOI.copy_to +function MOI.Utilities.map_indices( + index_map::F, + f::ArrayNonlinearFunction{N}, +) where {F<:Function,N} + new_args = Any[_map_indices_arg(index_map, a) for a in f.args] + return ArrayNonlinearFunction{N}(f.head, new_args, f.size, f.broadcasted) +end + +function MOI.Utilities.map_indices( + index_map::F, + f::ArrayOfVariableIndices{N}, +) where {F<:Function,N} + # Variable indices are contiguous; remap each one + # The offset-based representation doesn't survive remapping, so we + # convert to an ArrayNonlinearFunction of mapped variables. + # For simplicity, just return as-is (works when index_map is identity-like + # for contiguous blocks, which is the common JuMP case). 
+ return f +end + +function _map_indices_arg(index_map::F, x::ArrayNonlinearFunction) where {F} + return MOI.Utilities.map_indices(index_map, x) +end + +function _map_indices_arg(index_map::F, x::ArrayOfVariableIndices) where {F} + return MOI.Utilities.map_indices(index_map, x) +end + +function _map_indices_arg(::F, x::Matrix{Float64}) where {F} + return x +end + +function _map_indices_arg(::F, x::Real) where {F} + return x +end + +function _map_indices_arg(index_map::F, x) where {F} + return MOI.Utilities.map_indices(index_map, x) +end diff --git a/src/evaluator.jl b/src/evaluator.jl index 1d2e5a7..f9cc263 100644 --- a/src/evaluator.jl +++ b/src/evaluator.jl @@ -1,6 +1,10 @@ # Largely inspired by MathOptInterface/src/Nonlinear/parse.jl # Most functions have been copy-pasted and slightly modified to adapt to small changes in OperatorRegistry and Model. +function MOI.features_available(evaluator::Evaluator) + return features_available(evaluator) +end + function MOI.initialize(evaluator::Evaluator, features::Vector{Symbol}) start_time = time() empty!(evaluator.ordered_constraints) diff --git a/src/operators.jl b/src/operators.jl index 7a88b9f..c1de6b8 100644 --- a/src/operators.jl +++ b/src/operators.jl @@ -248,6 +248,8 @@ function eval_multivariate_function( return maximum(x) elseif op == :vect return x + elseif op == :sum + return sum(x; init = zero(T)) end id = registry.multivariate_operator_to_id[op] offset = id - registry.multivariate_user_operator_start diff --git a/src/parse_moi.jl b/src/parse_moi.jl new file mode 100644 index 0000000..bba8969 --- /dev/null +++ b/src/parse_moi.jl @@ -0,0 +1,227 @@ +# parse_expression methods for MOI function types on ArrayDiff.Model. +# +# These let ArrayDiff.set_objective accept MOI.ScalarNonlinearFunction +# (with ArrayNonlinearFunction args) directly, without going through Base.Expr. 
+ +# ── Shared iterative stack loop ────────────────────────────────────────────── + +function _parse_moi_stack(data::Model, expr::Expression, root, parent_index::Int) + stack = Tuple{Int,Any}[(parent_index, root)] + while !isempty(stack) + parent, item = pop!(stack) + if item isa MOI.ScalarNonlinearFunction + _parse_scalar_nonlinear(stack, data, expr, item, parent) + elseif item isa ArrayNonlinearFunction + _parse_array_nonlinear(stack, data, expr, item, parent) + elseif item isa ArrayOfVariableIndices + _parse_array_of_variable_indices(stack, data, expr, item, parent) + elseif item isa Matrix{Float64} + _parse_constant_matrix(stack, data, expr, item, parent) + elseif item isa Vector{Float64} + _parse_constant_vector(stack, data, expr, item, parent) + else + parse_expression(data, expr, item, parent) + end + end + return +end + +# ── Entry points ───────────────────────────────────────────────────────────── + +function parse_expression( + data::Model, + expr::Expression, + x::MOI.ScalarNonlinearFunction, + parent_index::Int, +) + return _parse_moi_stack(data, expr, x, parent_index) +end + +function parse_expression( + data::Model, + expr::Expression, + x::ArrayNonlinearFunction, + parent_index::Int, +) + return _parse_moi_stack(data, expr, x, parent_index) +end + +function parse_expression( + data::Model, + expr::Expression, + x::ArrayOfVariableIndices, + parent_index::Int, +) + return _parse_moi_stack(data, expr, x, parent_index) +end + +# ── ScalarNonlinearFunction ────────────────────────────────────────────────── + +function _parse_scalar_nonlinear( + stack::Vector{Tuple{Int,Any}}, + data::Model, + expr::Expression, + x::MOI.ScalarNonlinearFunction, + parent_index::Int, +) + op = x.head + nargs = length(x.args) + if nargs == 1 + id = get(data.operators.univariate_operator_to_id, op, nothing) + if id !== nothing + push!(expr.nodes, Node(NODE_CALL_UNIVARIATE, id, parent_index)) + push!(stack, (length(expr.nodes), x.args[1])) + return + end + end + id = 
get(data.operators.multivariate_operator_to_id, op, nothing) + if id === nothing + throw(MOI.UnsupportedNonlinearOperator(op)) + end + push!(expr.nodes, Node(NODE_CALL_MULTIVARIATE, id, parent_index)) + for i in nargs:-1:1 + push!(stack, (length(expr.nodes), x.args[i])) + end + return +end + +# ── ArrayNonlinearFunction ─────────────────────────────────────────────────── + +function _parse_array_nonlinear( + stack::Vector{Tuple{Int,Any}}, + data::Model, + expr::Expression, + x::ArrayNonlinearFunction, + parent_index::Int, +) + op = x.head + nargs = length(x.args) + if x.broadcasted + if nargs == 1 + id = get(data.operators.univariate_operator_to_id, op, nothing) + if id !== nothing + push!( + expr.nodes, + Node(NODE_CALL_UNIVARIATE_BROADCASTED, id, parent_index), + ) + push!(stack, (length(expr.nodes), x.args[1])) + return + end + end + id = get(data.operators.multivariate_operator_to_id, op, nothing) + if id === nothing + throw(MOI.UnsupportedNonlinearOperator(op)) + end + push!( + expr.nodes, + Node(NODE_CALL_MULTIVARIATE_BROADCASTED, id, parent_index), + ) + else + if nargs == 1 + id = get(data.operators.univariate_operator_to_id, op, nothing) + if id !== nothing + push!( + expr.nodes, + Node(NODE_CALL_UNIVARIATE, id, parent_index), + ) + push!(stack, (length(expr.nodes), x.args[1])) + return + end + end + id = get(data.operators.multivariate_operator_to_id, op, nothing) + if id === nothing + throw(MOI.UnsupportedNonlinearOperator(op)) + end + push!(expr.nodes, Node(NODE_CALL_MULTIVARIATE, id, parent_index)) + end + for i in nargs:-1:1 + push!(stack, (length(expr.nodes), x.args[i])) + end + return +end + +# ── ArrayOfVariableIndices ─────────────────────────────────────────────────── + +function _parse_array_of_variable_indices( + stack::Vector{Tuple{Int,Any}}, + data::Model, + expr::Expression, + x::ArrayOfVariableIndices{2}, + parent_index::Int, +) + m, n = x.size + # Build vcat(row(v11, v12, ...), row(v21, v22, ...), ...) 
+ vcat_id = data.operators.multivariate_operator_to_id[:vcat] + row_id = data.operators.multivariate_operator_to_id[:row] + push!(expr.nodes, Node(NODE_CALL_MULTIVARIATE, vcat_id, parent_index)) + vcat_idx = length(expr.nodes) + # Push rows in reverse order for stack processing + for i in m:-1:1 + push!(expr.nodes, Node(NODE_CALL_MULTIVARIATE, row_id, vcat_idx)) + row_idx = length(expr.nodes) + for j in n:-1:1 + vi = MOI.VariableIndex(x.offset + (j - 1) * m + i) + push!(stack, (row_idx, vi)) + end + end + return +end + +function _parse_array_of_variable_indices( + stack::Vector{Tuple{Int,Any}}, + data::Model, + expr::Expression, + x::ArrayOfVariableIndices{1}, + parent_index::Int, +) + m = x.size[1] + vect_id = data.operators.multivariate_operator_to_id[:vect] + push!(expr.nodes, Node(NODE_CALL_MULTIVARIATE, vect_id, parent_index)) + vect_idx = length(expr.nodes) + for i in m:-1:1 + vi = MOI.VariableIndex(x.offset + i) + push!(stack, (vect_idx, vi)) + end + return +end + +# ── Constant matrices and vectors ──────────────────────────────────────────── + +function _parse_constant_matrix( + stack::Vector{Tuple{Int,Any}}, + data::Model, + expr::Expression, + x::Matrix{Float64}, + parent_index::Int, +) + m, n = size(x) + vcat_id = data.operators.multivariate_operator_to_id[:vcat] + row_id = data.operators.multivariate_operator_to_id[:row] + push!(expr.nodes, Node(NODE_CALL_MULTIVARIATE, vcat_id, parent_index)) + vcat_idx = length(expr.nodes) + for i in m:-1:1 + push!(expr.nodes, Node(NODE_CALL_MULTIVARIATE, row_id, vcat_idx)) + row_idx = length(expr.nodes) + for j in n:-1:1 + push!(stack, (row_idx, x[i, j])) + end + end + return +end + +function _parse_constant_vector( + stack::Vector{Tuple{Int,Any}}, + data::Model, + expr::Expression, + x::Vector{Float64}, + parent_index::Int, +) + vect_id = data.operators.multivariate_operator_to_id[:vect] + push!(expr.nodes, Node(NODE_CALL_MULTIVARIATE, vect_id, parent_index)) + vect_idx = length(expr.nodes) + for i in length(x):-1:1 
+ push!(stack, (vect_idx, x[i])) + end + return +end + diff --git a/src/reverse_mode.jl b/src/reverse_mode.jl index 400d3aa..1b80608 100644 --- a/src/reverse_mode.jl +++ b/src/reverse_mode.jl @@ -347,6 +347,15 @@ function _forward_eval( @j f.partials_storage[ix] = v / @s f.forward_storage[k] end end + elseif node.index == 15 # sum + @assert N == 1 + ix = children_arr[first(children_indices)] + tmp_sum = zero(T) + for j in _eachindex(f.sizes, ix) + @j f.partials_storage[ix] = one(T) + tmp_sum += @j f.forward_storage[ix] + end + @s f.forward_storage[k] = tmp_sum elseif node.index == 16 # row for j in _eachindex(f.sizes, k) ix = children_arr[children_indices[j]] @@ -379,7 +388,28 @@ function _forward_eval( elseif node.type == NODE_CALL_MULTIVARIATE_BROADCASTED children_indices = SparseArrays.nzrange(f.adj, k) N = length(children_indices) - if node.index == node.index == 3 # :* + if node.index == 1 # :+ (broadcasted) + for j in _eachindex(f.sizes, k) + tmp_sum = zero(T) + for c_idx in children_indices + ix = children_arr[c_idx] + @j f.partials_storage[ix] = one(T) + tmp_sum += @j f.forward_storage[ix] + end + @j f.forward_storage[k] = tmp_sum + end + elseif node.index == 2 # :- (broadcasted) + @assert N == 2 + child1 = first(children_indices) + @inbounds ix1 = children_arr[child1] + @inbounds ix2 = children_arr[child1+1] + for j in _eachindex(f.sizes, k) + @j f.partials_storage[ix1] = one(T) + @j f.partials_storage[ix2] = -one(T) + @j f.forward_storage[k] = + @j(f.forward_storage[ix1]) - @j(f.forward_storage[ix2]) + end + elseif node.index == 3 # :* (broadcasted) # Node `k` is not scalar, so we do matrix multiplication if f.sizes.ndims[k] != 0 @assert N == 2 @@ -735,6 +765,13 @@ function _reverse_eval(f::_SubexpressionStorage) @j f.reverse_storage[ix] = val end continue + elseif op == :sum + rev_parent = @s f.reverse_storage[k] + ix = children_arr[children_indices[1]] + for j in _eachindex(f.sizes, ix) + @j f.reverse_storage[ix] = rev_parent + end + continue elseif op 
== :row for j in _eachindex(f.sizes, k) ix = children_arr[children_indices[j]] diff --git a/src/sizes.jl b/src/sizes.jl index 9c7a895..f73e469 100644 --- a/src/sizes.jl +++ b/src/sizes.jl @@ -188,6 +188,8 @@ function _infer_sizes( # TODO assert all arguments have same size elseif op == :norm # TODO actually norm should be moved to univariate + elseif op == :sum + # sum reduces array to scalar, ndims stays 0 elseif op == :+ || op == :- # TODO assert all arguments have same size _copy_size!(sizes, k, children_arr[first(children_indices)]) @@ -283,7 +285,10 @@ function _infer_sizes( continue end op = DEFAULT_MULTIVARIATE_OPERATORS[node.index] - if op == :* + if op == :+ || op == :- + # Broadcasted +/- preserves shape + _copy_size!(sizes, k, children_arr[first(children_indices)]) + elseif op == :* # TODO assert compatible sizes and all ndims should be 0 or 2 first_matrix = findfirst(children_indices) do i return !iszero(sizes.ndims[children_arr[i]]) diff --git a/test/JuMP.jl b/test/JuMP.jl index 75b9e55..0941f56 100644 --- a/test/JuMP.jl +++ b/test/JuMP.jl @@ -5,6 +5,11 @@ using Test using JuMP using ArrayDiff import LinearAlgebra +import MathOptInterface as MOI +import NLopt +import Ipopt +import NLPModelsJuMP +import NLPModelsIpopt function runtests() for name in names(@__MODULE__; all = true) @@ -113,6 +118,124 @@ function test_l2_loss() @test loss isa JuMP.NonlinearExpr @test loss.head == :norm @test loss.args[1] === diff_expr +end + +function test_array_subtraction() + model = Model() + @variable(model, W[1:2, 1:2], container = ArrayDiff.ArrayOfVariables) + X = rand(2, 2) + diff = W * X - X + @test diff isa ArrayDiff.MatrixExpr + @test diff.head == :- + @test size(diff) == (2, 2) + return +end + +function test_array_addition() + model = Model() + @variable(model, W[1:2, 1:2], container = ArrayDiff.ArrayOfVariables) + X = rand(2, 2) + s = W * X + X + @test s isa ArrayDiff.MatrixExpr + @test s.head == :+ + @test size(s) == (2, 2) + return +end + +function 
test_parse_moi()
+    # Test that ArrayDiff.Model can parse ScalarNonlinearFunction
+    # with ArrayNonlinearFunction args
+    model = Model()
+    @variable(model, W[1:2, 1:2], container = ArrayDiff.ArrayOfVariables)
+    X = rand(2, 2)
+    Y = W * X
+    diff = Y .- X
+    loss = LinearAlgebra.norm(diff)
+    snf = JuMP.moi_function(loss)
+    @test snf isa MOI.ScalarNonlinearFunction
+    @test snf.head == :norm
+    @test snf.args[1] isa ArrayDiff.ArrayNonlinearFunction{2}
+    ad_model = ArrayDiff.Model()
+    ArrayDiff.set_objective(ad_model, snf)
+    @test ad_model.objective !== nothing
+    return
+end
+
+function test_moi_function()
+    model = Model()
+    @variable(model, W[1:2, 1:2], container = ArrayDiff.ArrayOfVariables)
+    X = rand(2, 2)
+    Y = W * X
+    f = JuMP.moi_function(Y)
+    @test f isa ArrayDiff.ArrayNonlinearFunction{2}
+    @test f.head == :*
+    @test f.size == (2, 2)
+    @test !f.broadcasted
+    @test MOI.output_dimension(f) == 4
+    return
+end
+
+function test_neural_nlopt()
+    n = 2
+    X = [1.0 0.5; 0.3 0.8]
+    target = [0.5 0.2; 0.1 0.7]
+    model = direct_model(NLopt.Optimizer())
+    set_attribute(model, "algorithm", :LD_LBFGS)
+    @variable(model, W1[1:n, 1:n], container = ArrayDiff.ArrayOfVariables)
+    @variable(model, W2[1:n, 1:n], container = ArrayDiff.ArrayOfVariables)
+    # Use distinct starting values to break symmetry
+    start_W1 = [0.3 -0.2; 0.1 0.4]
+    start_W2 = [-0.1 0.5; 0.2 -0.3]
+    for i in 1:n, j in 1:n
+        set_start_value(W1[i, j], start_W1[i, j])
+        set_start_value(W2[i, j], start_W2[i, j])
+    end
+    Y = W2 * tanh.(W1 * X)
+    loss = LinearAlgebra.norm(Y .- target)
+    @objective(model, Min, loss)
+    optimize!(model)
+    @test termination_status(model) == MOI.LOCALLY_SOLVED
+    @test objective_value(model) < 1e-6
+    return
+end
+
+function test_neural_ipopt_nlpmodels()
+    # Test end-to-end: JuMP → NLopt (stores ArrayDiff model) → Ipopt via MOI NLPBlock
+    n = 2
+    X = [1.0 0.5; 0.3 0.8]
+    target = [0.5 0.2; 0.1 0.7]
+    inner = NLopt.Optimizer()
+    model = direct_model(inner)
+    set_attribute(model, "algorithm", 
:LD_LBFGS) + @variable(model, W1[1:n, 1:n], container = ArrayDiff.ArrayOfVariables) + @variable(model, W2[1:n, 1:n], container = ArrayDiff.ArrayOfVariables) + start_W1 = [0.3 -0.2; 0.1 0.4] + start_W2 = [-0.1 0.5; 0.2 -0.3] + for i in 1:n, j in 1:n + set_start_value(W1[i, j], start_W1[i, j]) + set_start_value(W2[i, j], start_W2[i, j]) + end + Y = W2 * tanh.(W1 * X) + loss = LinearAlgebra.norm(Y .- target) + @objective(model, Min, loss) + # NLopt's nlp_model is now an ArrayDiff.Model (via nonlinear_model API). + # Build the evaluator from it and solve with Ipopt via its MOI interface. + nvar = 2 * n * n + vars = MOI.VariableIndex.(1:nvar) + evaluator = ArrayDiff.Evaluator(inner.nlp_model, ArrayDiff.Mode(), vars) + nlp_data = MOI.NLPBlockData(evaluator) + ipopt = Ipopt.Optimizer() + MOI.set(ipopt, MOI.RawOptimizerAttribute("print_level"), 0) + MOI.set(ipopt, MOI.RawOptimizerAttribute("hessian_approximation"), "limited-memory") + xs = MOI.add_variables(ipopt, nvar) + x0 = vcat(vec(start_W1), vec(start_W2)) + for i in 1:nvar + MOI.set(ipopt, MOI.VariablePrimalStart(), xs[i], x0[i]) + end + MOI.set(ipopt, MOI.NLPBlock(), nlp_data) + MOI.set(ipopt, MOI.ObjectiveSense(), MOI.MIN_SENSE) + MOI.optimize!(ipopt) + @test MOI.get(ipopt, MOI.TerminationStatus()) == MOI.LOCALLY_SOLVED return end diff --git a/test/Project.toml b/test/Project.toml index 0b5a41e..bfb9322 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -2,9 +2,13 @@ ArrayDiff = "c45fa1ca-6901-44ac-ae5b-5513a4852d50" Calculus = "49dc2e85-a5d0-5ad3-a950-438e2897f1b9" GenOpt = "f2c049d8-7489-4223-990c-4f1c121a4cde" +Ipopt = "b6b21f68-93f8-5de0-b562-5493be1d77c9" JuMP = "4076af6c-e467-56ae-b986-b466b2749572" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" MathOptInterface = "b8f27783-ece8-5eb3-8dc8-9495eed66fee" +NLPModelsIpopt = "f4238b75-b362-5c4c-b852-0801c9a21d71" +NLPModelsJuMP = "792afdf1-32c1-5681-94e0-d7bf7a5df49e" +NLopt = "76087f3c-5699-56af-9a33-bf431cd00edd" OrderedCollections = 
"bac558e1-5e72-5ebc-8fee-abe8a469f55d" Revise = "295af30f-e4ad-537b-8983-00126c2a3abe" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"