-
Notifications
You must be signed in to change notification settings - Fork 99
Expand file tree
/
Copy pathforward_over_reverse.jl
More file actions
441 lines (430 loc) · 17.7 KB
/
forward_over_reverse.jl
File metadata and controls
441 lines (430 loc) · 17.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
# Copyright (c) 2017: Miles Lubin and contributors
# Copyright (c) 2017: Google Inc.
#
# Use of this source code is governed by an MIT-style license that can be found
# in the LICENSE.md file or at https://opensource.org/licenses/MIT.
const TAG = :ReverseAD
"""
_eval_hessian(
d::NLPEvaluator,
f::_FunctionStorage,
H::AbstractVector{Float64},
λ::Float64,
offset::Int,
)::Int
Evaluate the hessian matrix of the function `f` and store the result, scaled by
`λ`, in `H`, beginning at element `offset+1`. This function assumes that
`_reverse_mode(d, x)` has already been called.
Returns the number of non-zeros in the computed Hessian, which will be used to
update the offset for the next call.
"""
function _eval_hessian(
d::NLPEvaluator,
f::_FunctionStorage,
H::AbstractVector{Float64},
λ::Float64,
offset::Int,
)::Int
chunk = min(size(f.seed_matrix, 2), d.max_chunk)
# As a performance optimization, skip dynamic dispatch if the chunk is 1.
if chunk == 1
return _eval_hessian_inner(d, f, H, λ, offset, Val(1))
else
return _eval_hessian_inner(d, f, H, λ, offset, Val(chunk))
end
end
function _eval_hessian_inner(
d::NLPEvaluator,
ex::_FunctionStorage,
H::AbstractVector{Float64},
scale::Float64,
nzcount::Int,
::Val{CHUNK},
) where {CHUNK}
if ex.linearity == LINEAR
@assert length(ex.hess_I) == 0
return 0
end
T = ForwardDiff.Partials{CHUNK,Float64} # This is our element type.
Coloring.prepare_seed_matrix!(ex.seed_matrix, ex.rinfo)
local_to_global_idx = ex.rinfo.local_indices
input_ϵ_raw, output_ϵ_raw = d.input_ϵ, d.output_ϵ
input_ϵ = _reinterpret_unsafe(T, input_ϵ_raw)
output_ϵ = _reinterpret_unsafe(T, output_ϵ_raw)
# Compute hessian-vector products
num_products = size(ex.seed_matrix, 2) # number of hessian-vector products
num_chunks = div(num_products, CHUNK)
@assert size(ex.seed_matrix, 1) == length(local_to_global_idx)
for k in 1:CHUNK:(CHUNK*num_chunks)
for r in 1:length(local_to_global_idx)
# set up directional derivatives
@inbounds idx = local_to_global_idx[r]
# load up ex.seed_matrix[r,k,k+1,...,k+CHUNK-1] into input_ϵ
for s in 1:CHUNK
input_ϵ_raw[(idx-1)*CHUNK+s] = ex.seed_matrix[r, k+s-1]
end
@inbounds output_ϵ[idx] = zero(T)
end
_hessian_slice_inner(d, ex, input_ϵ, output_ϵ, T)
# collect directional derivatives
for r in 1:length(local_to_global_idx)
idx = local_to_global_idx[r]
# load output_ϵ into ex.seed_matrix[r,k,k+1,...,k+CHUNK-1]
for s in 1:CHUNK
ex.seed_matrix[r, k+s-1] = output_ϵ_raw[(idx-1)*CHUNK+s]
end
@inbounds input_ϵ[idx] = zero(T)
end
end
# leftover chunk
remaining = num_products - CHUNK * num_chunks
if remaining > 0
k = CHUNK * num_chunks + 1
for r in 1:length(local_to_global_idx)
# set up directional derivatives
@inbounds idx = local_to_global_idx[r]
# load up ex.seed_matrix[r,k,k+1,...,k+remaining-1] into input_ϵ
for s in 1:remaining
# leave junk in the unused components
input_ϵ_raw[(idx-1)*CHUNK+s] = ex.seed_matrix[r, k+s-1]
end
@inbounds output_ϵ[idx] = zero(T)
end
_hessian_slice_inner(d, ex, input_ϵ, output_ϵ, T)
# collect directional derivatives
for r in 1:length(local_to_global_idx)
idx = local_to_global_idx[r]
# load output_ϵ into ex.seed_matrix[r,k,k+1,...,k+remaining-1]
for s in 1:remaining
ex.seed_matrix[r, k+s-1] = output_ϵ_raw[(idx-1)*CHUNK+s]
end
@inbounds input_ϵ[idx] = zero(T)
end
end
want, got = nzcount + length(ex.hess_I), length(H)
if want > got
error(
"Vector provided for Hessian storage has too few elements. Got " *
"$got, want $want.",
)
end
# TODO(odow): consider reverting to a view.
output_slice = _UnsafeVectorView(nzcount, length(ex.hess_I), pointer(H))
Coloring.recover_from_matmat!(
output_slice,
ex.seed_matrix,
ex.rinfo,
d.output_ϵ,
)
for i in 1:length(output_slice)
output_slice[i] *= scale
end
return length(ex.hess_I)
end
function _hessian_slice_inner(d, ex, input_ϵ, output_ϵ, ::Type{T}) where {T}
subexpr_forward_values_ϵ =
_reinterpret_unsafe(T, d.subexpression_forward_values_ϵ)
for i in ex.dependent_subexpressions
subexpr = d.subexpressions[i]
subexpr_forward_values_ϵ[i] = _forward_eval_ϵ(
d,
subexpr,
_reinterpret_unsafe(T, d.storage_ϵ),
_reinterpret_unsafe(T, subexpr.partials_storage_ϵ),
input_ϵ,
subexpr_forward_values_ϵ,
d.data.operators,
)
end
_forward_eval_ϵ(
d,
ex,
_reinterpret_unsafe(T, d.storage_ϵ),
_reinterpret_unsafe(T, d.partials_storage_ϵ),
input_ϵ,
subexpr_forward_values_ϵ,
d.data.operators,
)
# do a reverse pass
subexpr_reverse_values_ϵ =
_reinterpret_unsafe(T, d.subexpression_reverse_values_ϵ)
for i in ex.dependent_subexpressions
subexpr_reverse_values_ϵ[i] = zero(T)
d.subexpression_reverse_values[i] = 0.0
end
_reverse_eval_ϵ(
output_ϵ,
ex,
_reinterpret_unsafe(T, d.storage_ϵ),
_reinterpret_unsafe(T, d.partials_storage_ϵ),
d.subexpression_reverse_values,
subexpr_reverse_values_ϵ,
1.0,
zero(T),
)
for i in length(ex.dependent_subexpressions):-1:1
j = ex.dependent_subexpressions[i]
subexpr = d.subexpressions[j]
_reverse_eval_ϵ(
output_ϵ,
subexpr,
_reinterpret_unsafe(T, d.storage_ϵ),
_reinterpret_unsafe(T, subexpr.partials_storage_ϵ),
d.subexpression_reverse_values,
subexpr_reverse_values_ϵ,
d.subexpression_reverse_values[j],
subexpr_reverse_values_ϵ[j],
)
end
return
end
"""
_forward_eval_ϵ(
d::NLPEvaluator,
ex::Union{_FunctionStorage,_SubexpressionStorage},
storage_ϵ::AbstractVector{ForwardDiff.Partials{N,T}},
partials_storage_ϵ::AbstractVector{ForwardDiff.Partials{N,T}},
x_values_ϵ,
subexpression_values_ϵ,
user_operators::Nonlinear.OperatorRegistry,
) where {N,T}
Evaluate the directional derivatives of the expression tree in `ex`.
This is equivalent to evaluating the expression tree using DualNumbers or
ForwardDiff, but instead we keep the `ex.forward_storage` for the epsilon
components separate so that we don't need to recompute the real components.
This assumes that `_reverse_model(d, x)` has already been called.
"""
function _forward_eval_ϵ(
d::NLPEvaluator,
ex::Union{_FunctionStorage,_SubexpressionStorage},
storage_ϵ::AbstractVector{ForwardDiff.Partials{N,T}},
partials_storage_ϵ::AbstractVector{ForwardDiff.Partials{N,T}},
x_values_ϵ,
subexpression_values_ϵ,
user_operators::Nonlinear.OperatorRegistry,
) where {N,T}
@assert length(storage_ϵ) >= length(ex.nodes)
@assert length(partials_storage_ϵ) >= length(ex.nodes)
zero_ϵ = zero(ForwardDiff.Partials{N,T})
# ex.nodes is already in order such that parents always appear before children
# so a backwards pass through ex.nodes is a forward pass through the tree
children_arr = SparseArrays.rowvals(ex.adj)
for k in length(ex.nodes):-1:1
node = ex.nodes[k]
partials_storage_ϵ[k] = zero_ϵ
if node.type == Nonlinear.NODE_VARIABLE
storage_ϵ[k] = x_values_ϵ[node.index]
elseif node.type == Nonlinear.NODE_VALUE
storage_ϵ[k] = zero_ϵ
elseif node.type == Nonlinear.NODE_SUBEXPRESSION
storage_ϵ[k] = subexpression_values_ϵ[node.index]
elseif node.type == Nonlinear.NODE_PARAMETER
storage_ϵ[k] = zero_ϵ
else
@assert node.type != Nonlinear.NODE_MOI_VARIABLE
ϵtmp = zero_ϵ
@inbounds children_idx = SparseArrays.nzrange(ex.adj, k)
for c_idx in children_idx
@inbounds ix = children_arr[c_idx]
@inbounds partial = ex.partials_storage[ix]
@inbounds storage_val = storage_ϵ[ix]
# TODO: This "if" statement can take 8% of the hessian
# evaluation time. Find a more efficient way.
if !isfinite(partial) && storage_val == zero_ϵ
continue
end
ϵtmp += storage_val * ex.partials_storage[ix]
end
storage_ϵ[k] = ϵtmp
if node.type == Nonlinear.NODE_CALL_MULTIVARIATE
# TODO(odow): consider how to refactor this into Nonlinear.
op = node.index
n_children = length(children_idx)
if op == 3 # :*
# Lazy approach for now.
anyzero = false
tmp_prod = one(ForwardDiff.Dual{TAG,T,N})
for c_idx in children_idx
ix = children_arr[c_idx]
sval = ex.forward_storage[ix]
gnum = ForwardDiff.Dual{TAG}(sval, storage_ϵ[ix])
tmp_prod *= gnum
anyzero = ifelse(sval * sval == zero(T), true, anyzero)
end
# By a quirk of floating-point numbers, we can have
# anyzero == true && ForwardDiff.value(tmp_prod) != zero(T)
if anyzero || n_children <= 2
for c_idx in children_idx
prod_others = one(ForwardDiff.Dual{TAG,T,N})
for c_idx2 in children_idx
(c_idx == c_idx2) && continue
ix = children_arr[c_idx2]
gnum = ForwardDiff.Dual{TAG}(
ex.forward_storage[ix],
storage_ϵ[ix],
)
prod_others *= gnum
end
partials_storage_ϵ[children_arr[c_idx]] =
ForwardDiff.partials(prod_others)
end
else
for c_idx in children_idx
ix = children_arr[c_idx]
prod_others =
tmp_prod / ForwardDiff.Dual{TAG}(
ex.forward_storage[ix],
storage_ϵ[ix],
)
partials_storage_ϵ[ix] =
ForwardDiff.partials(prod_others)
end
end
elseif op == 4 # :^
@assert n_children == 2
idx1 = first(children_idx)
idx2 = last(children_idx)
@inbounds ix1 = children_arr[idx1]
@inbounds ix2 = children_arr[idx2]
@inbounds base = ex.forward_storage[ix1]
@inbounds base_ϵ = storage_ϵ[ix1]
@inbounds exponent = ex.forward_storage[ix2]
@inbounds exponent_ϵ = storage_ϵ[ix2]
base_gnum = ForwardDiff.Dual{TAG}(base, base_ϵ)
exponent_gnum = ForwardDiff.Dual{TAG}(exponent, exponent_ϵ)
if exponent == 2
partials_storage_ϵ[ix1] = 2 * base_ϵ
elseif exponent == 1
partials_storage_ϵ[ix1] = zero_ϵ
else
partials_storage_ϵ[ix1] = ForwardDiff.partials(
exponent_gnum * pow(base_gnum, exponent_gnum - 1),
)
end
result_gnum = ForwardDiff.Dual{TAG}(
ex.forward_storage[k],
storage_ϵ[k],
)
# TODO(odow): fix me to use NaNMath.jl instead
log_base_gnum = base_gnum < 0 ? NaN : log(base_gnum)
partials_storage_ϵ[ix2] =
ForwardDiff.partials(result_gnum * log_base_gnum)
elseif op == 5 # :/
@assert n_children == 2
idx1 = first(children_idx)
idx2 = last(children_idx)
@inbounds ix1 = children_arr[idx1]
@inbounds ix2 = children_arr[idx2]
@inbounds numerator = ex.forward_storage[ix1]
@inbounds numerator_ϵ = storage_ϵ[ix1]
@inbounds denominator = ex.forward_storage[ix2]
@inbounds denominator_ϵ = storage_ϵ[ix2]
recip_denominator =
1 / ForwardDiff.Dual{TAG}(denominator, denominator_ϵ)
partials_storage_ϵ[ix1] =
ForwardDiff.partials(recip_denominator)
partials_storage_ϵ[ix2] = ForwardDiff.partials(
-ForwardDiff.Dual{TAG}(numerator, numerator_ϵ) *
recip_denominator *
recip_denominator,
)
elseif op > 6
f_input = _UnsafeVectorView(d.jac_storage, n_children)
for (i, c) in enumerate(children_idx)
f_input[i] = ex.forward_storage[children_arr[c]]
end
H = _UnsafeLowerTriangularMatrixView(
d.user_output_buffer,
n_children,
)
has_hessian = Nonlinear.eval_multivariate_hessian(
user_operators,
user_operators.multivariate_operators[node.index],
H,
f_input,
)
# This might be `false` if we extend this code to all
# multivariate functions.
@assert has_hessian
for col in 1:n_children
dual = zero(ForwardDiff.Partials{N,T})
for row in 1:n_children
# Make sure we get the lower-triangular component.
h = row >= col ? H[row, col] : H[col, row]
# Performance optimization: hessians can be quite
# sparse
if !iszero(h)
i = children_arr[children_idx[row]]
dual += h * storage_ϵ[i]
end
end
i = children_arr[children_idx[col]]
partials_storage_ϵ[i] = dual
end
end
elseif node.type == Nonlinear.NODE_CALL_UNIVARIATE
@inbounds child_idx = children_arr[ex.adj.colptr[k]]
f′′ = Nonlinear.eval_univariate_hessian(
user_operators,
node.index,
ex.forward_storage[child_idx],
)
partials_storage_ϵ[child_idx] = f′′ * storage_ϵ[child_idx]
end
end
end
return storage_ϵ[1]
end
# Compute directional derivatives of the reverse pass, goes with _forward_eval_ϵ
# to compute hessian-vector products.
function _reverse_eval_ϵ(
output_ϵ::AbstractVector{ForwardDiff.Partials{N,T}},
ex::Union{_FunctionStorage,_SubexpressionStorage},
reverse_storage_ϵ,
partials_storage_ϵ,
subexpression_output,
subexpression_output_ϵ,
scale::T,
scale_ϵ::ForwardDiff.Partials{N,T},
) where {N,T}
@assert length(reverse_storage_ϵ) >= length(ex.nodes)
@assert length(partials_storage_ϵ) >= length(ex.nodes)
if ex.nodes[1].type == Nonlinear.NODE_VARIABLE
@inbounds output_ϵ[ex.nodes[1].index] += scale_ϵ
return
elseif ex.nodes[1].type == Nonlinear.NODE_SUBEXPRESSION
@inbounds subexpression_output[ex.nodes[1].index] +=
scale * ex.reverse_storage[1]
@inbounds subexpression_output_ϵ[ex.nodes[1].index] += scale_ϵ
return
end
reverse_storage_ϵ[1] = scale_ϵ
for k in 2:length(ex.nodes)
@inbounds node = ex.nodes[k]
if node.type == Nonlinear.NODE_VALUE ||
node.type == Nonlinear.NODE_LOGIC ||
node.type == Nonlinear.NODE_COMPARISON ||
node.type == Nonlinear.NODE_PARAMETER
continue
end
parent_value = scale * ex.reverse_storage[node.parent]
if !isfinite(ex.partials_storage[k]) && iszero(parent_value)
reverse_storage_ϵ[k] = zero(ForwardDiff.Partials{N,T})
else
reverse_storage_ϵ[k] = ForwardDiff._mul_partials(
partials_storage_ϵ[k],
reverse_storage_ϵ[node.parent],
parent_value,
ex.partials_storage[k],
)
end
if node.type == Nonlinear.NODE_VARIABLE
@inbounds output_ϵ[node.index] += reverse_storage_ϵ[k]
elseif node.type == Nonlinear.NODE_SUBEXPRESSION
@inbounds subexpression_output[node.index] +=
scale * ex.reverse_storage[k]
@inbounds subexpression_output_ϵ[node.index] += reverse_storage_ϵ[k]
end
end
return
end