# use package numpy. Call it "np" later in code
import numpy as np
# define sigmoid function
def sigmoid(x):
    """Logistic activation: squash x (scalar or ndarray) into the open interval (0, 1)."""
    decayed = np.exp(-x)
    return 1.0 / (1 + decayed)
# define sigmoid derivative function
# it is a derivative with respect to the sigmoid itself
# here is explanation https://math.stackexchange.com/questions/78575/derivative-of-sigmoid-function-sigma-x-frac11e-x#1225116
def sigmoid_derivative(x):
    """Derivative of the sigmoid, expressed in terms of the sigmoid's OUTPUT.

    If s = sigmoid(z), then ds/dz = s * (1 - s). Callers pass s (the already
    activated value), not the raw pre-activation z.
    """
    complement = 1.0 - x
    return x * complement
# define Neural Network class (start)
class NeuralNetwork:
    """A minimal feed-forward network with one hidden layer.

    Architecture: input -> 4 sigmoid hidden units -> 1 sigmoid output unit.
    Trained by full-batch gradient descent on the squared error
    (y - output)**2, with the gradients derived by hand in backprop().
    """

    def __init__(self, x, y):
        """Store the training batch and initialise random weights.

        x: 2-D array, one training example per row (n_samples, n_features).
        y: 2-D array of target outputs, shape (n_samples, 1).
        """
        self.input = x
        # Input->hidden weights: (n_features, 4) for 4 hidden units.
        # NOTE: keep the two rand() calls in this order so seeded runs
        # consume the RNG identically.
        self.weights1 = np.random.rand(self.input.shape[1], 4)
        # Hidden->output weights: the 4 hidden activations map to 1 output.
        self.weights2 = np.random.rand(4, 1)
        # Ground-truth targets.
        self.y = y
        # Prediction placeholder, same shape as y; filled by feedforward().
        self.output = np.zeros(y.shape)

    def feedforward(self):
        """Run the whole batch through both layers.

        Caches the hidden activations in self.layer1 (needed by backprop)
        and the final prediction in self.output.
        """
        hidden_pre = np.dot(self.input, self.weights1)
        self.layer1 = sigmoid(hidden_pre)           # shape (n_samples, 4)
        output_pre = np.dot(self.layer1, self.weights2)
        self.output = sigmoid(output_pre)           # shape (n_samples, 1)

    def backprop(self):
        """One gradient-descent step on loss = (y - output)**2.

        Chain rule pieces:
          d(loss)/d(output)            = -2 * (y - output)
          d(sigmoid)/d(pre-activation) = out * (1 - out)   (sigmoid_derivative)
        The minus sign is folded into the update by ADDING the deltas at the
        end, so this is ordinary gradient descent with step size 1.
        """
        # Error signal at the output layer, shape (n_samples, 1).
        output_delta = 2 * (self.y - self.output) * sigmoid_derivative(self.output)
        # Hidden->output gradient: transpose layer1 so the matrix product
        # sums each weight's contribution over all samples in the batch.
        grad_w2 = np.dot(self.layer1.T, output_delta)
        # Push the output error back through weights2, then through the
        # hidden sigmoid, giving the error signal at the hidden layer.
        hidden_delta = np.dot(output_delta, self.weights2.T) * sigmoid_derivative(self.layer1)
        # Input->hidden gradient, again summed over the batch.
        grad_w1 = np.dot(self.input.T, hidden_delta)
        # Apply both updates.
        self.weights1 += grad_w1
        self.weights2 += grad_w2
# define Neural Network class (end)
# --- Script entry: build the toy dataset, train the network, report results ---

# Training inputs: 4 examples (rows), 3 features each, i.e. shape (4, 3).
X = np.array([[0, 0, 1],
              [0, 1, 1],
              [1, 0, 1],
              [1, 1, 1]])

# Targets as a column vector, shape (4, 1), so the matrix algebra lines up
# (a flat [0, 1, 1, 0] would be a row and break the shape expectations).
y = np.array([[0], [1], [1], [0]])

# Both X and y are numpy arrays so np.dot and elementwise arithmetic work.
nn = NeuralNetwork(X, y)

# Alternate forward passes and weight updates for 1000 rounds.
for i in range(1000):
    nn.feedforward()
    nn.backprop()

# Per-example squared error after training (computed for inspection).
loss = (nn.y - nn.output) ** 2
print(nn.output)
print(nn.y)

# Extra batch of inputs (currently unused).
X2 = np.array([[0, 0, 0],
               [0, 1, 0]])