Automatic Differentiation, part 1 (Deep Learning 2017, Johnson)
Google Brain
Our awesome new world
• github.com/hips/autograd
• differentiates native Python code
• derivatives of derivatives
import autograd.numpy as np
import autograd.numpy.random as npr
from autograd import grad

def predict(weights, inputs):
    for W, b in weights:
        outputs = np.dot(inputs, W) + b
        inputs = np.tanh(outputs)
    return outputs

def init_params(scale, sizes):
    return [(npr.randn(m, n) * scale,
             npr.randn(n) * scale)
            for m, n in zip(sizes[:-1], sizes[1:])]

def logprob_fun(weights, inputs, targets):
    preds = predict(weights, inputs)
    return np.sum((preds - targets)**2)

gradient_fun = grad(logprob_fun)
Autograd examples
import autograd.numpy as np
from autograd import grad, elementwise_grad as egrad   # grad, applied elementwise
import matplotlib.pyplot as plt

x = np.linspace(-7, 7, 200)
plt.plot(x, np.tanh(x),
         x, egrad(np.tanh)(x),                                      # first derivative
         x, egrad(egrad(np.tanh))(x),                               # second derivative
         x, egrad(egrad(egrad(np.tanh)))(x),                        # third derivative
         x, egrad(egrad(egrad(egrad(np.tanh))))(x),                 # fourth derivative
         x, egrad(egrad(egrad(egrad(egrad(np.tanh)))))(x),          # fifth derivative
         x, egrad(egrad(egrad(egrad(egrad(egrad(np.tanh))))))(x))   # sixth derivative
plt.show()
def hvp(fun):
    # Hessian-vector product: differentiate the scalar grad(fun)(arg) . vector
    def grad_dot_vector(arg, vector):
        return np.dot(grad(fun)(arg), vector)
    return grad(grad_dot_vector)
$$\nabla^2 f(x)\, v = \nabla_x\!\left(\nabla_x f(x) \cdot v\right)$$
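A quick sanity check of hvp, using the imports and hvp above (an illustration, not from the slides; A, f, x, and v are names made up here): for f(x) = ½ xᵀAx with symmetric A, the Hessian is A, so the Hessian-vector product should equal Av.

A = np.array([[2.0, 1.0],
              [1.0, 3.0]])
f = lambda x: 0.5 * np.dot(x, np.dot(A, x))   # Hessian of f is A

x = np.array([0.3, -1.2])
v = np.array([1.0, 2.0])
print(hvp(f)(x, v))       # [4., 7.]
print(np.dot(A, v))       # same: [4., 7.]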
But what about inference?
Black-box inference in a tweet
• Stan also provides inference routines...
Tutorial goals
2. Autograd’s implementation
• Fully closed tracing autodiff in Python
$$F = D \circ C \circ B \circ A, \qquad y = F(x) = D(C(B(A(x))))$$

$$F : \mathbb{R}^n \to \mathbb{R}, \qquad x \in \mathbb{R}^n \;\mapsto\; y \in \mathbb{R}$$
With $y = D(c)$, $c = C(b)$, $b = B(a)$, $a = A(x)$:

$$F'(x) = \frac{\partial y}{\partial x} = \begin{bmatrix} \frac{\partial y}{\partial x_1} & \cdots & \frac{\partial y}{\partial x_n} \end{bmatrix}$$

$$F'(x) = \frac{\partial y}{\partial c}\,\frac{\partial c}{\partial b}\,\frac{\partial b}{\partial a}\,\frac{\partial a}{\partial x}$$

$$\frac{\partial y}{\partial c} = D'(c), \qquad \frac{\partial c}{\partial b} = C'(b), \qquad \frac{\partial b}{\partial a} = B'(a), \qquad \frac{\partial a}{\partial x} = A'(x)$$
Forward accumulation groups the chain rule from the input side, so each intermediate is a Jacobian matrix (or, with a seed vector, a Jacobian-vector product):

$$F'(x) = \frac{\partial y}{\partial c}\left(\frac{\partial c}{\partial b}\left(\frac{\partial b}{\partial a}\,\frac{\partial a}{\partial x}\right)\right), \qquad \frac{\partial b}{\partial x} = \begin{bmatrix} \frac{\partial b_1}{\partial x_1} & \cdots & \frac{\partial b_1}{\partial x_n} \\ \vdots & \ddots & \vdots \\ \frac{\partial b_m}{\partial x_1} & \cdots & \frac{\partial b_m}{\partial x_n} \end{bmatrix}$$

Reverse accumulation groups from the output side, so each intermediate is a row vector:

$$F'(x) = \left(\left(\frac{\partial y}{\partial c}\,\frac{\partial c}{\partial b}\right)\frac{\partial b}{\partial a}\right)\frac{\partial a}{\partial x}, \qquad \frac{\partial y}{\partial b} = \begin{bmatrix} \frac{\partial y}{\partial b_1} & \cdots & \frac{\partial y}{\partial b_m} \end{bmatrix}$$

With an explicit vector, the two orderings become Jacobian-vector and vector-Jacobian products, seeded with $\frac{\partial x}{\partial x} = I$ (or a column $v$) and $\frac{\partial y}{\partial y} = 1$ (or a row $v^\top$) respectively:

$$F'(x)\,v = \frac{\partial y}{\partial c}\left(\frac{\partial c}{\partial b}\left(\frac{\partial b}{\partial a}\left(\frac{\partial a}{\partial x}\,v\right)\right)\right), \qquad F'(x) = \frac{\partial y}{\partial c}\,\frac{\partial c}{\partial b}\,\frac{\partial b}{\partial a}\,\frac{\partial a}{\partial x}\,\frac{\partial x}{\partial x}$$

$$v^\top F'(x) = \left(\left(\left(v^\top\frac{\partial y}{\partial c}\right)\frac{\partial c}{\partial b}\right)\frac{\partial b}{\partial a}\right)\frac{\partial a}{\partial x}, \qquad F'(x) = \frac{\partial y}{\partial y}\,\frac{\partial y}{\partial c}\,\frac{\partial c}{\partial b}\,\frac{\partial b}{\partial a}\,\frac{\partial a}{\partial x}$$
Forward and reverse accumulation
• Forward accumulation
  • Jacobian-vector products
  • “push-forward”
• Reverse accumulation
  • vector-Jacobian products
  • “pull-back”
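As an illustration of the reverse-mode "pull-back" (not from the slides; F, x, and v are names made up here), autograd's make_vjp returns a function that maps a covector v on the output to vᵀF'(x):

import autograd.numpy as np
from autograd import make_vjp

def F(x):                              # F : R^3 -> R^2
    return np.array([x[0] * x[1], np.sin(x[2])])

x = np.array([1.0, 2.0, 3.0])
vjp, y = make_vjp(F)(x)                # vjp "pulls back" a covector on the output
v = np.array([1.0, 0.0])
print(vjp(v))                          # v^T F'(x) = [2., 1., 0.]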
Non-chain composition

Fan-in: $y = F(x_1, x_2)$

$$\frac{\partial y}{\partial x_1} = F_1'(x_1, x_2), \qquad \frac{\partial y}{\partial x_2} = F_2'(x_1, x_2)$$

Fan-out: $G(x) = \begin{bmatrix} x \\ x \end{bmatrix} = \begin{bmatrix} I \\ I \end{bmatrix} x$

$$G'(x) = \begin{bmatrix} I \\ I \end{bmatrix}, \qquad v^\top G'(x) = \begin{bmatrix} v_1^\top & v_2^\top \end{bmatrix}\begin{bmatrix} I \\ I \end{bmatrix} = v_1^\top + v_2^\top$$
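A tiny illustration of the fan-out rule above (not from the slides): when x is used twice, the two reverse-mode contributions add.

import autograd.numpy as np
from autograd import grad

f = lambda x: np.sin(x) + x**2        # x fans out into two uses
print(grad(f)(1.0))                   # cos(1) + 2, the two contributions summed
print(np.cos(1.0) + 2.0)              # same value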
Tutorial goals
2. Autograd’s implementation
• Fully closed tracing autodiff in Python
[Diagram: autograd.numpy.sum is a primitive wrapper around numpy.sum. The incoming Node ã (value: a, function: F, parents: [x]) is unboxed to its raw value a, numpy.sum computes b, and the result is boxed into a new Node b̃ (value: b, function: anp.sum, parents: [ã]).]
class Node(object):
    __slots__ = ['value', 'recipe', 'progenitors', 'vspace']
class primitive(object):
    def __call__(self, *args, **kwargs):
        argvals = list(args)
        progenitors = set()
        parents = []
        for argnum, arg in enumerate(args):
            if isnode(arg):
                argvals[argnum] = arg.value
                if argnum in self.zero_vjps: continue
                parents.append((argnum, arg))
                progenitors.update(arg.progenitors & active_progenitors)
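The excerpt stops after collecting parents. Roughly (a sketch, not the verbatim autograd source; new_node is an assumed helper name), the call then runs the raw function on the unboxed values and boxes the result into a Node that records how it was computed:

        # Sketch of the assumed continuation (not verbatim source):
        result_value = self.fun(*argvals, **kwargs)      # call the raw numpy function
        return new_node(result_value,                    # box the result into a Node,
                        (self, args, kwargs, parents),   # recording its recipe
                        progenitors)                     # and its active progenitors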
[Diagram: the forward pass traces the chain start_node x → a = A(x) → b = B(a) → c = C(b) → y = D(c) (end_node).]
No control flow!
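Because only the operations that actually executed are recorded, ordinary Python control flow poses no problem. A small illustration (not from the slides):

import autograd.numpy as np
from autograd import grad

def f(x):
    if x > 0:                  # data-dependent branch
        for _ in range(3):     # ordinary Python loop
            x = np.sin(x)
    return x

print(grad(f)(0.5))            # differentiates exactly the path that ran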
Autograd’s ingredients
Given a primitive $a = A(x)$ and $\frac{\partial y}{\partial a}$, what is $\frac{\partial y}{\partial x}$?

$$\frac{\partial y}{\partial x} = \frac{\partial y}{\partial a} \cdot \frac{\partial a}{\partial x} = \frac{\partial y}{\partial a} \cdot A'(x)$$

a vector-Jacobian product.
def forward_pass(fun, args, kwargs, argnum=0):
    args = list(args)
    start_node = new_progenitor(args[argnum])
    args[argnum] = start_node
    active_progenitors.add(start_node)
    end_node = fun(*args, **kwargs)
    active_progenitors.remove(start_node)
    return start_node, end_node
[Diagram: the backward pass walks the trace from end_node (y = D(c)) back to start_node (x), seeding $\frac{\partial y}{\partial y} = 1$ and accumulating $\frac{\partial y}{\partial c}$, $\frac{\partial y}{\partial b}$, $\frac{\partial y}{\partial a}$, $\frac{\partial y}{\partial x}$ with one vector-Jacobian product per node.]

higher-order autodiff just works:
the backward pass can itself be traced
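A one-line consequence (an illustration, not from the slides):

import autograd.numpy as np
from autograd import grad

print(grad(grad(np.sin))(1.0))   # second derivative of sin: -sin(1.0)
print(-np.sin(1.0))              # same value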
def backward_pass(g, end_node, start_node):
    outgrads = {end_node : (g, False)}
    assert_vspace_match(outgrads[end_node][0], end_node.vspace, None)
    for node in toposort(end_node, start_node):
        if node not in outgrads: continue
        cur_outgrad = outgrads.pop(node)
        function, args, kwargs, parents = node.recipe
        for argnum, parent in parents:
            outgrad = function.vjp(argnum, cur_outgrad[0], node,
                                   parent.vspace, node.vspace, args, kwargs)
            assert_vspace_match(outgrad, parent.vspace, function)
            outgrads[parent] = add_outgrads(parent.vspace, outgrads.get(parent),
                                            outgrad)
    return cur_outgrad[0]
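Tying the two pieces together, a minimal gradient function could be sketched as below. This is a simplification, not autograd's actual grad (which also handles vector spaces and error checking); simple_grad is a name used only here.

def simple_grad(fun, argnum=0):
    # Sketch: trace fun with forward_pass, then seed the backward pass with 1.0
    # (appropriate for a scalar-output fun) and propagate vector-Jacobian products.
    def gradfun(*args, **kwargs):
        start_node, end_node = forward_pass(fun, args, kwargs, argnum)
        return backward_pass(1.0, end_node, start_node)
    return gradfun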
Checkpointing

[Diagram: with checkpointing, intermediates of a wrapped sub-computation are not stored; the backward pass re-runs that piece of the forward pass to recover $\frac{\partial y}{\partial c}$, $\frac{\partial y}{\partial b}$, ... as they are needed.]
def checkpoint(fun):
    """Returns a checkpointed version of `fun`, where intermediate values
    computed during the forward pass of `fun` are discarded and then recomputed
    for the backward pass. Useful to trade off time and memory."""
    def wrapped_grad(argnum, g, ans, vs, gvs, args, kwargs):
        return make_vjp(fun, argnum)(*args, **kwargs)[0](g)
    wrapped = primitive(fun)
    wrapped.vjp = wrapped_grad
    return wrapped
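A hypothetical usage sketch, assuming the checkpoint wrapper above and that era's autograd API; expensive_block and loss are names invented here:

import autograd.numpy as np
from autograd import grad

def expensive_block(x):
    for _ in range(10):            # many intermediates we would rather recompute
        x = np.tanh(x)
    return x

checkpointed_block = checkpoint(expensive_block)

def loss(x):
    return np.sum(checkpointed_block(x) ** 2)

g = grad(loss)(np.ones(3))         # same gradient as without checkpointing, but the
                                   # block's intermediates are recomputed, not stored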
import tensorflow as tf

def fwd_gradients(ys, xs, d_xs):
    v = tf.placeholder(ys.dtype, shape=ys.get_shape())  # dummy variable
    g = tf.gradients(ys, xs, grad_ys=v)
    return tf.gradients(g, v, grad_ys=d_xs)

[Diagram: for y = f(x), reverse mode maps v ↦ Jᵀv; differentiating that map with respect to v (a second reverse-mode pass) gives u ↦ Ju, a forward-mode Jacobian-vector product.]
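The same trick works in autograd, since the backward pass is itself traceable. A sketch (the helper name jvp_via_double_vjp is not autograd's API):

import autograd.numpy as np
from autograd import make_vjp

def jvp_via_double_vjp(f, x, u):
    # Reverse pass 1: vjp(v) = v^T J.  Reverse pass 2: differentiate that linear
    # map with respect to v to recover u -> J u (forward mode).
    vjp, y = make_vjp(f)(x)
    vjp_of_vjp, _ = make_vjp(vjp)(np.zeros_like(y))
    return vjp_of_vjp(u)

f = lambda x: np.sin(x) * x            # elementwise, so J = diag(cos(x)*x + sin(x))
x = np.array([1.0, 2.0])
u = np.array([1.0, 0.0])
print(jvp_via_double_vjp(f, x, u))     # J u = [cos(1) + sin(1), 0]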
Solutions, optima, and fixed points

$$x^*(a) = \arg\min_x f(a, x), \qquad \nabla x^*(a) = \;?$$

The implicit function theorem

$$g(a, x^*(a)) = 0 \quad\Longrightarrow\quad \nabla x^*(a) = -\nabla_a g(a, x^*)\left[\nabla_x g(a, x^*)\right]^{-1}$$
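One way to see this (a brief derivation added here, written in Jacobian notation, which is the transpose of the slide's gradient convention): differentiate the defining equation with the chain rule.

$$0 = \frac{d}{da}\, g(a, x^*(a)) = \partial_a g(a, x^*) + \partial_x g(a, x^*)\,\frac{\partial x^*}{\partial a}
\quad\Longrightarrow\quad
\frac{\partial x^*}{\partial a} = -\left[\partial_x g(a, x^*)\right]^{-1} \partial_a g(a, x^*)$$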
from functools import partial

@primitive
def fixed_point(f, a, init, converged, max_iter):
    update = partial(f, a)
    current, prev = update(init), init
    for _ in range(max_iter):
        if converged(current, prev): break
        current, prev = update(current), current
    else:
        print('fixed point iteration limit reached')
    return current
Differentiating fixed points

[Diagram: the iteration unrolls into a chain $a \to x_{\text{init}} \to x_1 \to x_2 \to x_3 \to \cdots \to x_{n-2} \to x_{n-1} \to x_n$.]

$$x^* = x_n = x_{n-1} = x_{n-2} = \cdots \qquad (n \to \infty)$$
fixed_point.defvjp(grad_fixed_point, 1)
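A hypothetical usage sketch (sqrt_iter, close_enough, and sqrt are names invented here), assuming fixed_point has been given its vector-Jacobian product via the defvjp call above:

import autograd.numpy as np
from autograd import grad

def sqrt_iter(a, x):
    return 0.5 * (x + a / x)           # Babylonian iteration converging to sqrt(a)

def close_enough(current, prev, tol=1e-6):
    return np.abs(current - prev) < tol

def sqrt(a):
    return fixed_point(sqrt_iter, a, init=1.0,
                       converged=close_enough, max_iter=100)

print(grad(sqrt)(4.0))                 # d/da sqrt(a) at a = 4 is 0.25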