Generalizing gradient descent
Table of Contents
These are my notes from the book Grokking Deep Learning by Andrew Trask. Feel free to check my first post on this book to get my overall thoughts and recommendations on how to approach this series. The rest of my notes for this book can be found here
Gradient descent learning with multiple inputs
 We can use the same technique to update a network with multiple weights.
Example code:
import numpy as np
# # Empty network with multiple inputs
weights = [0.1, 0.2, .1]
def neural_network(my_input, weights):
pred = np.dot(my_input, weights)
return pred
# PREDICT+COMPARE: Making prediction and calc error and delta
toes = [8.5, 9.5, 9.9, 9.0]
wlrec = [0.65, 0.8, 0.8, 0.9]
nfans = [1.2, 1.3, 0.5, 1.0]
win_or_lose_binary = [1, 1, 0, 1]
true = win_or_lose_binary[0]
# my_input corresponds to every entry
# for first game of the season
my_input = [toes[0], wlrec[0], nfans[0]]
pred = neural_network(my_input, weights)
# BUG FIX: the minus sign was lost in transcription -- delta is pred MINUS true.
delta = pred - true
error = delta ** 2
print(f'Prediction: {pred:.2f}')
print(f'Error: {error:.2f}')
print(f'Delta: {delta:.2f}')
Prediction: 0.86
Error: 0.02
Delta: -0.14
# LEARN: Calculate each 'weight delta', put on each weight
# Helper function
def ele_mul(number, vector):
    """Scale each element of vector by number, writing into a 3-slot output list.

    Slots beyond len(vector) are left as 0.
    """
    output = [0, 0, 0]
    for i, value in enumerate(vector):
        output[i] = number * value
    return output
weight_deltas = ele_mul(delta, my_input)
# Here's what's happening: each weight's delta is its input scaled by delta.
# my_input[i] * delta = weight_deltas[i]
# 8.5 * -0.14 = -1.19 = weight_deltas[0]
# 0.65 * -0.14 = -0.091 = weight_deltas[1]
# 1.2 * -0.14 = -0.168 = weight_deltas[2]
print(['%.2f' % item for item in weight_deltas])
['-1.19', '-0.09', '-0.17']
# LEARN: Updating the weights
alpha = 0.01
for i in range(len(weights)):
    # BUG FIX: gradient descent SUBTRACTS the alpha-scaled weight_delta
    # (the '-' of '-=' was lost in transcription; plain '=' would discard
    # the old weight entirely).
    weights[i] -= alpha * weight_deltas[i]
print("Weights:" + str(weights))
print("Weight Deltas:" + str(weight_deltas))
# Here's what's happening:
# 0.1 - (-1.19 * 0.01) = 0.1119 = weights[0]
# 0.2 - (-0.091 * 0.01) = 0.20091 = weights[1]
# 0.1 - (-0.168 * 0.01) = 0.09832 = weights[2]
Weights:[0.1119, 0.20091, 0.09832]
Weight Deltas:[-1.189999999999999, -0.09099999999999994, -0.16799999999999987]
Gradient descent — multiple weights explanation
Refresher

pred
is calculated by taking the weighted sum, aka dot product of the inputs and their respective weights. 
delta
is a measure of how much we want a node's value to be higher or lower to get error down to zero. Calculated by: delta = pred - true

weight_delta
is a derivative based estimate for the direction and amount we should move a weight to reduce our node_delta.
Now we have a single delta
, but need to convert into 3 weight_delta
values. We do this by elementwise multiplication between delta
and each input in our my_input
list.
How does this account for stopping, negative reversal, and scaling?
 If input was 0, then weight wouldn't have mattered. (stopping)
 If input was negative, I'd want to decrease my weight. (negative reversal)
 If input was large, then I'd want to move my weight up a lot to compensate. (scaling)
Once we have new weight_deltas
we multiply by alpha, and subtract from the original weight
for each input to update weights across the network.
Example of several steps of learning
# Neural Network
def neural_network(my_input, weights):
out = 0
for i in range(len(my_input)):
out += (my_input[i] * weights[i])
return out
def ele_mul(scalar, vector):
    """Multiply the first 3 elements of vector by scalar.

    Always produces exactly 3 outputs (iterates over the output slots,
    so vector must have at least 3 elements; extras are ignored).
    """
    out = [0] * 3
    for i in range(3):
        out[i] = scalar * vector[i]
    return out
toes = [8.5, 9.5, 9.9, 9.0]
wlrec = [0.65, 0.8, 0.8, 0.9]
nfans = [1.2, 1.3, 0.5, 1.0]
win_or_lose_binary = [1, 1, 0, 1]
true = win_or_lose_binary[0]
alpha = 0.01
weights = [0.1, 0.2, .1]
# BUG FIX: this was assigned to `input` (shadowing the builtin) while the
# loop below reads `my_input` -- use one consistent name.
my_input = [toes[0], wlrec[0], nfans[0]]
# Run neural network prediction for three gradient-descent iterations
for iter in range(3):
    pred = neural_network(my_input, weights)
    # BUG FIX: minus signs restored (lost in transcription)
    error = (pred - true) ** 2
    delta = pred - true
    weight_deltas = ele_mul(delta, my_input)
    print(f'Iteration: {iter+1}')
    print(f'Pred: {pred}')
    print(f'Error: {error}')
    print(f'Weights: {weights}')
    print(f'Weight Deltas: {weight_deltas}')
    print()
    # Update weights based on weight_deltas (subtract, don't overwrite)
    for i in range(len(weights)):
        weights[i] -= alpha * weight_deltas[i]
Iteration: 1
Pred: 0.8600000000000001
Error: 0.01959999999999997
Weights: [0.1, 0.2, 0.1]
Weight Deltas: [-1.189999999999999, -0.09099999999999994, -0.16799999999999987]
Iteration: 2
Pred: 0.9637574999999999
Error: 0.0013135188062500048
Weights: [0.1119, 0.20091, 0.09832]
Weight Deltas: [-0.30806125000000056, -0.023557625000000044, -0.04349100000000008]
Iteration: 3
Pred: 0.9906177228125002
Error: 8.802712522307997e-05
Weights: [0.11498061250000001, 0.20114557625, 0.09788509000000001]
Weight Deltas: [-0.07974935609374867, -0.006098480171874899, -0.011258732624999811]
Notice the network becoming more accurate!
Gradient descent — multiple outputs
Instead of predicting just whether the team won or lost, we will now predict whether they are happy/sad and the % of the team that is hurt. These are the multiple outputs. We will use only the current win/loss record as a single input.
# EMPTY NEURAL NETWORK: one scalar input, three outputs (one weight each)
weights = [0.3, 0.2, 0.9]

def neural_network(my_input, weights):
    """Scalar input times the weight vector -> one prediction per output node."""
    return ele_mul(my_input, weights)
# PREDICT
wlrec = [0.65, 1.0, 1.0, 0.9]
hurt = [0.1, 0.0, 0.0, 0.1]
win = [ 1, 1, 0, 1]
sad = [0.1, 0.0, 0.1, 0.2]
my_input = wlrec[0]
true = [hurt[0], win[0], sad[0]]
pred = neural_network(my_input, weights)
error = [0, 0, 0]
delta = [0, 0, 0]
for i in range(len(true)):
    # BUG FIX: minus signs restored (lost in transcription)
    error[i] = (pred[i] - true[i]) ** 2
    delta[i] = pred[i] - true[i]
# COMPARE: calc each weight_delta, put on each weight
# Helper Function
def scalar_ele_mul(number, vector):
    """Multiply every element of a 3-vector by a scalar."""
    output = [0, 0, 0]
    assert(len(output) == len(vector))
    for i, value in enumerate(vector):
        output[i] = number * value
    return output
# LEARN: update the weights
weight_deltas = scalar_ele_mul(my_input, delta)
alpha = 0.1
for i in range(len(weights)):
    # BUG FIX: gradient descent SUBTRACTS the scaled weight_delta
    # ('-=' lost its '-' in transcription)
    weights[i] -= (weight_deltas[i] * alpha)
print("Weights:" + str(weights))
print("Weight Deltas:" + str(weight_deltas))
Weights:[0.293825, 0.25655, 0.868475]
Weight Deltas:[0.061750000000000006, -0.5655, 0.3152500000000001]
Gradient descent with multiple inputs and outputs
# Empty network with multiple inputs & outputs
#            toes %win  #fans
weights = [ [0.1, 0.1, -0.3],  # hurt?
            [0.1, 0.2, 0.0],   # win?
            [0.0, 1.3, 0.1] ]  # sad?
# BUG FIX: -0.3 restored (sign lost in transcription); with +0.3 the printed
# delta below (0.455...) would be impossible -- 8.5*0.1 + 0.65*0.1 + 1.2*(-0.3)
# - 0.1 = 0.455 confirms the negative weight.
# Helper function
def w_sum(a,b):
assert(len(a) == len(b))
output = 0
for i in range(len(a)):
output += (a[i] * b[i])
return output
# Helper function
def vect_mat_mul(vect, matrix):
    """Multiply vect against each row of matrix: one weighted sum per output."""
    assert(len(vect) == len(matrix))
    output = [0, 0, 0]
    for i, row in enumerate(matrix):
        output[i] = w_sum(vect, row)
    return output
def neural_network(my_input, weights):
    """Prediction vector: my_input multiplied through the weight matrix."""
    return vect_mat_mul(my_input, weights)
# PREDICT: Make a prediction & calc error and delta
toes = [8.5, 9.5, 9.9, 9.0]
wlrec = [0.65,0.8, 0.8, 0.9]
nfans = [1.2, 1.3, 0.5, 1.0]
hurt = [0.1, 0.0, 0.0, 0.1]
win = [ 1, 1, 0, 1]
sad = [0.1, 0.0, 0.1, 0.2]
alpha = 0.01
my_input = [toes[0], wlrec[0], nfans[0]]
true = [hurt[0], win[0], sad[0]]
pred = neural_network(my_input, weights)
error = [0, 0, 0]
delta = [0, 0, 0]
for i in range(len(true)):
    # BUG FIX: was (pred[i]  true[i] ** 2) -- the minus sign was lost and the
    # exponent must apply to the whole difference, not just true[i].
    error[i] = (pred[i] - true[i]) ** 2
    delta[i] = pred[i] - true[i]
# COMPARE: Calc each 'weight delta' and putting on each weight
import numpy as np

# Helper function
def outer_prod(vec_a, vec_b):
    """Outer product as a float64 matrix: out[i][j] = vec_a[i] * vec_b[j]."""
    out = np.zeros((len(vec_a), len(vec_b)))
    for i, a in enumerate(vec_a):
        for j, b in enumerate(vec_b):
            out[i][j] = a * b
    return out
weight_deltas = outer_prod(my_input, delta)
alpha = 0.1
for i in range(len(weights)):
    for j in range(len(weights[0])):
        # BUG FIX: gradient descent SUBTRACTS the scaled delta
        # ('-=' lost its '-' in transcription)
        weights[i][j] -= alpha * weight_deltas[i][j]
# Here's an example of what outer_prod() is doing
outer_prod([1,2,3], [2,4,6])
array([[ 2., 4., 6.],
[ 4., 8., 12.],
[ 6., 12., 18.]])
my_input
[8.5, 0.65, 1.2]
delta
[0.45500000000000007, -0.019999999999999907, 0.8650000000000001]
weights
[[-0.28675000000000006, 0.11699999999999992, -1.0352500000000002],
[0.070425, 0.2013, -0.05622500000000002],
[-0.05460000000000001, 1.3024, -0.003799999999999998]]