import numpy as np


def softmax(x):
    """Compute the softmax of x, shifted by max(x) for numerical stability."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)
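
# Usage sketch (illustrative values, not part of the original assignment):
#   softmax(np.array([1.0, 2.0, 3.0]))  # -> approx. [0.090, 0.245, 0.665]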


def sigmoid(x):
    """Compute the element-wise logistic sigmoid of x."""
    return 1 / (1 + np.exp(-x))
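
# Usage sketch (illustrative values, not part of the original assignment):
#   sigmoid(np.array([-1.0, 0.0, 1.0]))  # -> approx. [0.269, 0.500, 0.731]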


def initialize_adam(parameters):
    """
    Initializes v and s as two python dictionaries with:
        - keys: "dW1", "db1", ..., "dWL", "dbL"
        - values: numpy arrays of zeros of the same shape as the corresponding
          gradients/parameters.

    Arguments:
    parameters -- python dictionary containing your parameters.
                  parameters["W" + str(l)] = Wl
                  parameters["b" + str(l)] = bl

    Returns:
    v -- python dictionary that will contain the exponentially weighted average
         of the gradient.
         v["dW" + str(l)] = ...
         v["db" + str(l)] = ...
    s -- python dictionary that will contain the exponentially weighted average
         of the squared gradient.
         s["dW" + str(l)] = ...
         s["db" + str(l)] = ...
    """

    L = len(parameters) // 2  # number of layers in the neural network
    v = {}
    s = {}

    # Initialize v, s. Input: "parameters". Outputs: "v, s".
    for l in range(L):
        ### START CODE HERE ### (approx. 4 lines)
        v["dW" + str(l + 1)] = np.zeros(parameters["W" + str(l + 1)].shape)
        v["db" + str(l + 1)] = np.zeros(parameters["b" + str(l + 1)].shape)
        s["dW" + str(l + 1)] = np.zeros(parameters["W" + str(l + 1)].shape)
        s["db" + str(l + 1)] = np.zeros(parameters["b" + str(l + 1)].shape)
        ### END CODE HERE ###

    return v, s
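
# Usage sketch (hypothetical two-layer shapes, not part of the original assignment):
#   params = {"W1": np.random.randn(2, 3), "b1": np.zeros((2, 1)),
#             "W2": np.random.randn(1, 2), "b2": np.zeros((1, 1))}
#   v, s = initialize_adam(params)
#   v["dW1"].shape  # -> (2, 3), filled with zeros (same for s)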


def update_parameters_with_adam(parameters, grads, v, s, t, learning_rate=0.01,
                                beta1=0.9, beta2=0.999, epsilon=1e-8):
    """
    Update parameters using Adam.

    Arguments:
    parameters -- python dictionary containing your parameters:
                  parameters['W' + str(l)] = Wl
                  parameters['b' + str(l)] = bl
    grads -- python dictionary containing your gradients for each parameter:
             grads['dW' + str(l)] = dWl
             grads['db' + str(l)] = dbl
    v -- Adam variable, moving average of the first gradient, python dictionary
    s -- Adam variable, moving average of the squared gradient, python dictionary
    t -- Adam step counter (starting at 1), used for bias correction
    learning_rate -- the learning rate, scalar
    beta1 -- Exponential decay hyperparameter for the first moment estimates
    beta2 -- Exponential decay hyperparameter for the second moment estimates
    epsilon -- hyperparameter preventing division by zero in Adam updates

    Returns:
    parameters -- python dictionary containing your updated parameters
    v -- Adam variable, moving average of the first gradient, python dictionary
    s -- Adam variable, moving average of the squared gradient, python dictionary
    """

    L = len(parameters) // 2  # number of layers in the neural network
    v_corrected = {}  # Initializing first moment estimate, python dictionary
    s_corrected = {}  # Initializing second moment estimate, python dictionary

    # Perform Adam update on all parameters
    for l in range(L):
        # Moving average of the gradients. Inputs: "v, grads, beta1". Output: "v".
        ### START CODE HERE ### (approx. 2 lines)
        v["dW" + str(l + 1)] = beta1 * v["dW" + str(l + 1)] + (1 - beta1) * grads["dW" + str(l + 1)]
        v["db" + str(l + 1)] = beta1 * v["db" + str(l + 1)] + (1 - beta1) * grads["db" + str(l + 1)]
        ### END CODE HERE ###

        # Compute bias-corrected first moment estimate. Inputs: "v, beta1, t". Output: "v_corrected".
        ### START CODE HERE ### (approx. 2 lines)
        v_corrected["dW" + str(l + 1)] = v["dW" + str(l + 1)] / (1 - beta1 ** t)
        v_corrected["db" + str(l + 1)] = v["db" + str(l + 1)] / (1 - beta1 ** t)
        ### END CODE HERE ###

        # Moving average of the squared gradients. Inputs: "s, grads, beta2". Output: "s".
        ### START CODE HERE ### (approx. 2 lines)
        s["dW" + str(l + 1)] = beta2 * s["dW" + str(l + 1)] + (1 - beta2) * (grads["dW" + str(l + 1)] ** 2)
        s["db" + str(l + 1)] = beta2 * s["db" + str(l + 1)] + (1 - beta2) * (grads["db" + str(l + 1)] ** 2)
        ### END CODE HERE ###

        # Compute bias-corrected second raw moment estimate. Inputs: "s, beta2, t". Output: "s_corrected".
        ### START CODE HERE ### (approx. 2 lines)
        s_corrected["dW" + str(l + 1)] = s["dW" + str(l + 1)] / (1 - beta2 ** t)
        s_corrected["db" + str(l + 1)] = s["db" + str(l + 1)] / (1 - beta2 ** t)
        ### END CODE HERE ###

        # Update parameters. Inputs: "parameters, learning_rate, v_corrected, s_corrected, epsilon". Output: "parameters".
        # Note: epsilon is added outside the square root, per the standard Adam update.
        ### START CODE HERE ### (approx. 2 lines)
        parameters["W" + str(l + 1)] = parameters["W" + str(l + 1)] - learning_rate * v_corrected["dW" + str(l + 1)] / (np.sqrt(s_corrected["dW" + str(l + 1)]) + epsilon)
        parameters["b" + str(l + 1)] = parameters["b" + str(l + 1)] - learning_rate * v_corrected["db" + str(l + 1)] / (np.sqrt(s_corrected["db" + str(l + 1)]) + epsilon)
        ### END CODE HERE ###

    return parameters, v, s
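

if __name__ == "__main__":
    # Minimal smoke test -- a sketch with made-up shapes and values, not part
    # of the original assignment. Runs a single Adam step (t=1) on a tiny
    # two-layer parameter set and prints one updated weight matrix.
    np.random.seed(1)
    params = {"W1": np.random.randn(2, 3), "b1": np.zeros((2, 1)),
              "W2": np.random.randn(1, 2), "b2": np.zeros((1, 1))}
    grads = {"dW1": np.random.randn(2, 3), "db1": np.random.randn(2, 1),
             "dW2": np.random.randn(1, 2), "db2": np.random.randn(1, 1)}
    v, s = initialize_adam(params)
    params, v, s = update_parameters_with_adam(params, grads, v, s, t=1)
    print("W1 after one Adam step:\n", params["W1"])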