Generalizing gradient descent

Published January 14, 2019

These are my notes from the book Grokking Deep Learning by Andrew Trask. Feel free to check my first post on this book to get my overall thoughts and recommendations on how to approach this series. The rest of my notes for this book can be found here

Gradient descent learning with multiple inputs

  • We can use the same technique to update a network with multiple weights.

Example code:

import numpy as np

# Empty network with multiple inputs: one starting weight per input feature.
weights = [
    0.1,   # scales toes
    0.2,   # scales wlrec
    -0.1,  # scales nfans
]
def neural_network(my_input, weights):
    """Return the prediction: the dot product of the inputs and the weights."""
    return np.dot(my_input, weights)


# PREDICT + COMPARE: predict the first game, then measure the miss.

# One entry per game for each input statistic.
toes = [8.5, 9.5, 9.9, 9.0]
wlrec = [0.65, 0.8, 0.8, 0.9]
nfans = [1.2, 1.3, 0.5, 1.0]

# Target values, one per game; only the first game is used here.
win_or_lose_binary = [1, 1, 0, 1]
true = win_or_lose_binary[0]

# Gather the first game's value from every input statistic.
my_input = [series[0] for series in (toes, wlrec, nfans)]

pred = neural_network(my_input, weights)
delta = pred - true   # signed miss
error = delta ** 2    # squared miss, always non-negative

print(
    f'Prediction: {pred:.2f}',
    f'Error: {error:.2f}',
    f'Delta: {delta:.2f}',
    sep='\n',
)
Prediction: 0.86
Error: 0.02
Delta: -0.14
# LEARN: Calculate each 'weight delta', put on each weight

# Helper function
def ele_mul(number, vector):
    """Multiply every element of ``vector`` by the scalar ``number``.

    Args:
        number: Scalar multiplier.
        vector: Sequence of numbers; any length, including empty.

    Returns:
        A new list with the same length as ``vector`` whose i-th entry is
        ``number * vector[i]``.
    """
    # Size the result from the input. A fixed three-slot list raised
    # IndexError on longer vectors and zero-padded shorter ones.
    return [number * element for element in vector]

# Scale the single delta by each input to get one weight_delta per weight.
weight_deltas = ele_mul(delta, my_input)

# Worked out by hand:
# my_input[i] * delta = weight_deltas[i]
# 8.5         * -0.14 = -1.19  = weight_deltas[0]
# 0.65        * -0.14 = -0.091 = weight_deltas[1]
# 1.2         * -0.14 = -0.168 = weight_deltas[2]

print([f'{item:.2f}' for item in weight_deltas])
['-1.19', '-0.09', '-0.17']
# LEARN: nudge each weight against its weight_delta, scaled by alpha.
alpha = 0.01

for idx, step in enumerate(weight_deltas):
    weights[idx] -= alpha * step

print(f"Weights:{weights}")
print(f"Weight Deltas:{weight_deltas}")

# Worked out by hand:
#  0.1 - (-1.19  * 0.01) =  0.1119  = weights[0]
#  0.2 - (-0.091 * 0.01) =  0.20091 = weights[1]
# -0.1 - (-0.168 * 0.01) = -0.09832 = weights[2]
Weights:[0.1119, 0.20091, -0.09832]
Weight Deltas:[-1.189999999999999, -0.09099999999999994, -0.16799999999999987]

Gradient descent - Multiple weights explanation

Refresher

  • pred is calculated by taking the weighted sum, aka dot product of the inputs and their respective weights.

  • delta is a measure of how much we want a node’s value to be higher or lower to get error down to zero. Calculated by: delta = pred - true

  • weight_delta is a derivative based estimate for the direction and amount we should move a weight to reduce our node_delta.

Now we have a single delta, but need to convert into 3 weight_delta values. We do this by elementwise multiplication between delta and each input in our my_input list.

How does this account for stopping, negative reversal, and scaling?

  • If input was 0, then weight wouldn’t have mattered. (stopping)
  • If input was negative, I’d want to decrease my weight. (negative reversal)
  • If input was large, then I’d want to move my weight up a lot to compensate. (scaling)

Once we have new weight_deltas we multiply by alpha, and subtract from the original weight for each input to update weights across the network.

Example of several steps of learning

# Neural Network
def neural_network(my_input, weights):
    """Return the weighted sum of ``my_input`` using the matching ``weights``."""
    total = 0
    # Accumulate left to right so the floating-point result is reproducible.
    for position, value in enumerate(my_input):
        total += value * weights[position]
    return total

def ele_mul(scalar, vector):
    """Scale every element of ``vector`` by ``scalar``.

    Args:
        scalar: Number to multiply each element by.
        vector: Sequence of numbers; any length, including empty.

    Returns:
        A new list with the same length as ``vector`` whose i-th entry is
        ``vector[i] * scalar``.
    """
    # Iterate the input itself. Looping over a fixed three-slot list
    # truncated longer vectors and raised IndexError on shorter ones.
    return [element * scalar for element in vector]

# One entry per game for each input statistic.
toes = [8.5, 9.5, 9.9, 9.0]
wlrec = [0.65, 0.8, 0.8, 0.9]
nfans = [1.2, 1.3, 0.5, 1.0]

# Target values, one per game; training uses only the first game.
win_or_lose_binary = [1, 1, 0, 1]
true = win_or_lose_binary[0]

alpha = 0.01
weights = [0.1, 0.2, -.1]
# Bind the name the loop below actually reads. Assigning to `input` shadowed
# the builtin and left `my_input` dependent on an earlier cell having run.
my_input = [toes[0], wlrec[0], nfans[0]]

# Run three predict -> compare -> learn steps on the same example.
for iteration in range(3):
    pred = neural_network(my_input, weights)
    delta = pred - true
    error = delta ** 2
    weight_deltas = ele_mul(delta, my_input)

    # Report the state *before* this iteration's update is applied.
    print(f'Iteration: {iteration + 1}')
    print(f'Pred: {pred}')
    print(f'Error: {error}')
    print(f'Weights: {weights}')
    print(f'Weight Deltas: {weight_deltas}')
    print()

    # Update weights based on weight_deltas
    for i in range(len(weights)):
        weights[i] -= alpha * weight_deltas[i]
Iteration: 1
Pred: 0.8600000000000001
Error: 0.01959999999999997
Weights: [0.1, 0.2, -0.1]
Weight Deltas: [-1.189999999999999, -0.09099999999999994, -0.16799999999999987]

Iteration: 2
Pred: 0.9637574999999999
Error: 0.0013135188062500048
Weights: [0.1119, 0.20091, -0.09832]
Weight Deltas: [-0.30806125000000056, -0.023557625000000044, -0.04349100000000008]

Iteration: 3
Pred: 0.9906177228125002
Error: 8.802712522307997e-05
Weights: [0.11498061250000001, 0.20114557625, -0.09788509000000001]
Weight Deltas: [-0.07974935609374867, -0.006098480171874899, -0.011258732624999811]

Notice the network becoming more accurate!

Gradient descent - multiple outputs

Instead of predicting just whether the team won or lost, we will now also predict whether they are happy/sad and the % of the team that is hurt. These are the multiple outputs. We will use only the current win/loss record as a single input.

# EMPTY NEURAL NETWORK: one weight per predicted output.
weights = [
    0.3,  # hurt?
    0.2,  # win?
    0.9,  # sad?
]

def neural_network(my_input, weights):
    """Return one prediction per weight by scaling each weight by ``my_input``."""
    return ele_mul(my_input, weights)

# PREDICT
# The single input statistic, one entry per game.
wlrec = [0.65, 1.0, 1.0, 0.9]

# The three target series, one entry per game.
hurt = [0.1, 0.0, 0.0, 0.1]
win = [1, 1, 0, 1]
sad = [0.1, 0.0, 0.1, 0.2]

# Use the first game only.
my_input = wlrec[0]
true = [hurt[0], win[0], sad[0]]

pred = neural_network(my_input, weights)

# Signed miss and squared miss for each output.
delta = [guess - target for guess, target in zip(pred, true)]
error = [(guess - target) ** 2 for guess, target in zip(pred, true)]

    
# COMPARE: calc each weight_delta, put on each weight
# Helper Function
def scalar_ele_mul(number, vector):
    """Multiply every element of ``vector`` by the scalar ``number``.

    Args:
        number: Scalar multiplier.
        vector: Sequence of numbers; any length, including empty.

    Returns:
        A new list with the same length as ``vector`` whose i-th entry is
        ``number * vector[i]``.
    """
    # Build the result from the input, so no length assertion against a
    # fixed three-slot buffer is needed and any vector length works.
    return [number * element for element in vector]

# LEARN: scale each output's delta by the shared input, then step the weights.
weight_deltas = scalar_ele_mul(my_input, delta)

alpha = 0.1

for idx, step in enumerate(weight_deltas):
    weights[idx] -= (step * alpha)

print(f"Weights:{weights}")
print(f"Weight Deltas:{weight_deltas}")
Weights:[0.293825, 0.25655, 0.868475]
Weight Deltas:[0.061750000000000006, -0.5655, 0.3152500000000001]

Gradient descent with multiple inputs and outputs

# Empty network with multiple inputs & outputs.
# Each row holds the weights feeding one output; columns follow input order.
weights = [
    # toes, %win, #fans
    [0.1, 0.1, -0.3],  # hurt?
    [0.1, 0.2, 0.0],   # win?
    [0.0, 1.3, 0.1],   # sad?
]

# Helper function
def w_sum(a, b):
    """Return the weighted sum (dot product) of two equal-length sequences."""
    assert len(a) == len(b)
    total = 0
    # Accumulate pairwise products left to right.
    for left, right in zip(a, b):
        total += left * right
    return total

# Helper function
def vect_mat_mul(vect, matrix):
    """Multiply ``vect`` by each row of ``matrix``.

    Args:
        vect: Sequence of numbers.
        matrix: Sequence of rows; each row must have the same length as
            ``vect`` (``w_sum`` checks this for every row).

    Returns:
        A list with one entry per row of ``matrix``; entry i is the weighted
        sum of ``vect`` and ``matrix[i]``.
    """
    # One output per *row*. The old version sized the result at three slots
    # and looped over len(vect), so it only worked for 3x3 matrices. The
    # constraint that matters (vect length == row length) is enforced by
    # w_sum, so non-square matrices are supported too.
    return [w_sum(vect, row) for row in matrix]

def neural_network(my_input, weights):
    """Return one prediction per row of the ``weights`` matrix."""
    return vect_mat_mul(my_input, weights)

# PREDICT: Make a prediction & calc error and delta

# Input statistics, one entry per game.
toes  = [8.5, 9.5, 9.9, 9.0]
wlrec = [0.65,0.8, 0.8, 0.9]
nfans = [1.2, 1.3, 0.5, 1.0]

# Target series, one entry per game.
hurt  = [0.1, 0.0, 0.0, 0.1]
win   = [  1,   1,   0,   1]
sad   = [0.1, 0.0, 0.1, 0.2]

alpha = 0.01

# Use the first game only.
my_input = [toes[0], wlrec[0], nfans[0]]
true = [hurt[0], win[0], sad[0]]

pred = neural_network(my_input, weights)

error = [0, 0, 0]
delta = [0, 0, 0]

for i in range(len(true)):
    delta[i] = pred[i] - true[i]
    # Square the whole difference. `pred[i] - true[i] ** 2` squared only the
    # target, which is not an error measure and can go negative.
    error[i] = delta[i] ** 2

    
# COMPARE: Calc each 'weight delta' and putting on each weight
import numpy as np

# Helper function
def outer_prod(vec_a, vec_b):
    """Return the outer product of two vectors as a float NumPy array.

    The result has shape ``(len(vec_a), len(vec_b))`` and entry ``[i][j]``
    equals ``vec_a[i] * vec_b[j]``.
    """
    result = np.zeros((len(vec_a), len(vec_b)))

    for row, a_val in enumerate(vec_a):
        for col, b_val in enumerate(vec_b):
            result[row, col] = a_val * b_val
    return result

# The weight matrix is laid out [output][input] (rows: hurt/win/sad, columns:
# toes/%win/#fans), so row i must be scaled by output i's delta:
#     weight_deltas[i][j] = delta[i] * my_input[j]
# Passing (my_input, delta) builds the transpose of this and applies every
# off-diagonal update to the wrong weight.
weight_deltas = outer_prod(delta, my_input)
alpha = 0.1

for i in range(len(weights)):
    for j in range(len(weights[0])):
        # float() keeps the stored weights plain Python floats, not NumPy scalars.
        weights[i][j] -= alpha * float(weight_deltas[i][j])
# Here's an example of what outer_prod() is doing
outer_prod([1,2,3], [2,4,6])
array([[ 2.,  4.,  6.],
       [ 4.,  8., 12.],
       [ 6., 12., 18.]])
my_input
[8.5, 0.65, 1.2]
delta
[0.45500000000000007, -0.019999999999999907, 0.8650000000000001]
# Updated weights, rounded to 6 decimal places for readability
[[round(w, 6) for w in row] for row in weights]
[[-0.28675, 0.070425, -0.3546],
 [0.117, 0.2013, 0.0024],
 [-0.73525, 1.243775, -0.0038]]