This is the second part of the assignment. You will learn how to build a basic fully connected neural network.
# Import modules
from __future__ import print_function
import tensorflow as tf
import numpy as np
import time
import os
import matplotlib.pyplot as plt
import pickle
import warnings
warnings.filterwarnings('ignore')
from utils.cifar_utils import load_data
# Plot configurations
%matplotlib inline
# Notebook auto reloads code. (Ref: http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython)
%load_ext autoreload
%autoreload 2
# Load the raw CIFAR-10 data.
X_train, y_train, X_test, y_test = load_data()
# Data organizations:
# Train data: 49000 samples from original train set: 1~49,000
# Validation data: 1000 samples from original train set: 49,000~50,000
# Test data: 10000 samples from original test set: 1~10,000
# Development data (for gradient check): 100 from the train set: 1~49,000
num_training = 49000
num_validation = 1000
num_dev = 100
X_val = X_train[-num_validation:, :]
y_val = y_train[-num_validation:]
mask = np.random.choice(num_training, num_dev, replace=False)
X_dev = X_train[mask]
y_dev = y_train[mask]
X_train = X_train[:num_training, :]
y_train = y_train[:num_training]
# Preprocessing: subtract the mean value across every dimension for training data
mean_image = np.mean(X_train, axis=0)
X_train = X_train.astype(np.float32) - mean_image.astype(np.float32)
X_val = X_val.astype(np.float32) - mean_image
X_test = X_test.astype(np.float32) - mean_image
X_dev = X_dev.astype(np.float32) - mean_image
print(X_train.shape, X_val.shape, X_test.shape, X_dev.shape)
print('Train data shape: ', X_train.shape)
print('Train labels shape: ', y_train.shape)
print('Validation data shape: ', X_val.shape)
print('Validation labels shape: ', y_val.shape)
print('Test data shape: ', X_test.shape)
print('Test labels shape: ', y_test.shape)
print('Development data shape:', X_dev.shape)
print('Development labels shape: ', y_dev.shape)
(49000, 3072) (1000, 3072) (10000, 3072) (100, 3072) Train data shape: (49000, 3072) Train labels shape: (49000,) Validation data shape: (1000, 3072) Validation labels shape: (1000,) Test data shape: (10000, 3072) Test labels shape: (10000,) Development data shape: (100, 3072) Development labels shape: (100,)
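As an optional sanity check (not part of the required assignment code), you can verify that the mean subtraction worked: the per-pixel mean of the preprocessed training set should be close to zero, up to float32 rounding.
## Optional: per-pixel mean of the preprocessed training data should be ~0
print('max abs per-pixel mean after preprocessing:', np.abs(X_train.mean(axis=0)).max())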
from builtins import range
import numpy as np
def affine_forward(x, w, b):
"""
Computes the forward pass for an affine function.
The input x has shape (N, d_1, ..., d_k) and contains a minibatch of N
examples, where each example x[i] has shape (d_1, ..., d_k). We will
reshape each input into a vector of dimension D = d_1 * ... * d_k, and
then transform it to an output vector of dimension M.
Inputs:
- x: a numpy array containing input data, of shape (N, d_1, ..., d_k)
- w: a numpy array of weights, of shape (D, M)
- b: a numpy array of biases, of shape (M,)
Returns:
- out: output, of shape (N, M)
"""
# Implement the affine forward pass. Store the result in 'out'. You
# will need to reshape the input into rows.
x = x.reshape(x.shape[0],-1)
out = np.matmul(x,w) + b
return out
def affine_backward(dout, x, w, b):
"""
Computes the backward pass of an affine function.
Inputs:
- dout: upstream derivative, of shape (N, M)
- x: input data, of shape (N, d_1, ... d_k)
- w: weights, of shape (D, M)
- b: bias, of shape (M,)
Returns a tuple of:
- dx: gradient with respect to x, of shape (N, d1, ..., d_k)
- dw: gradient with respect to w, of shape (D, M)
- db: gradient with respect to b, of shape (M,)
"""
dx = None #initialize
dw = None
db = None
dx = np.matmul(dout, w.T).reshape(x.shape)
dw = np.matmul(x.reshape(x.shape[0], -1).T, dout)
db = np.sum(dout, axis=0)
return dx, dw, db
NOTE: Please do not change the code in the cell below. It will run correctly if your code is right.
# THE FOLLOWING CODE IS JUST FOR CHECKING. #
# NO NEED TO CHANGE IT. #
from utils.layer_funcs import affine_forward
from utils.layer_funcs import affine_backward
# generate data for checking
x = X_dev
w = np.random.rand(x.shape[1],100)
b = np.random.rand(100)
dout = np.ones((x.shape[0],100))
## Affine function -- H = W*X + b
out = affine_forward(x, w, b)
dx, dw, db = affine_backward(dout, x, w, b)
## check by tf.GradientTape
x_tf = tf.Variable(x, name='x')
w_tf = tf.Variable(w, name='w')
b_tf = tf.Variable(b, name='b')
def affine_layer(x, w, b):
return tf.matmul(x, w) + b
with tf.GradientTape() as tape:
tape.watch(w_tf)
out_tf = affine_layer(x_tf, w_tf, b_tf)
dx_tf, dw_tf, db_tf = tape.gradient(out_tf, (x_tf, w_tf, b_tf))
out_check = out_tf.numpy()
dx_check, dw_check, db_check = dx_tf.numpy(), dw_tf.numpy(), db_tf.numpy()
## Print validation result
print("Is out correct? {}".format(np.allclose(out, out_check)))
print("Is dx correct? {}".format(np.allclose(dx, dx_check)))
print("Is dw correct? {}".format(np.allclose(dw, dw_check)))
print("Is db correct? {}".format(np.allclose(db, db_check)))
Is out correct? True Is dx correct? True Is dw correct? True Is db correct? True
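If you would like a check that does not rely on TensorFlow, a central finite difference on a single weight entry is a quick alternative. The cell below is a minimal sketch on small synthetic data; the names x_small, w_small, etc. are only for this illustration and are not part of the assignment utilities.
## Optional: numerical gradient check for affine_backward on tiny random data
x_small = np.random.randn(5, 4)
w_small = np.random.randn(4, 3)
b_small = np.random.randn(3)
dout_small = np.ones((5, 3))  # gradient of sum(out) wrt out
_, dw_small, _ = affine_backward(dout_small, x_small, w_small, b_small)
h = 1e-6
w_plus, w_minus = w_small.copy(), w_small.copy()
w_plus[0, 0] += h
w_minus[0, 0] -= h
numeric = (np.sum(affine_forward(x_small, w_plus, b_small)) -
           np.sum(affine_forward(x_small, w_minus, b_small))) / (2 * h)
print("numeric vs analytic dw[0, 0]:", numeric, dw_small[0, 0])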
Complete the functions relu_forward and relu_backward
def relu_forward(x):
"""
Computes the forward pass for rectified linear units (ReLUs).
Input:
- x: inputs, of any shape
Returns a tuple of:
- out: output, of the same shape as x
"""
# Implement the ReLU forward pass.
out = np.maximum(0, x)
return out
def relu_backward(dout, x):
"""
Computes the backward pass for rectified linear units (ReLUs).
Inputs:
- dout: upstream derivatives, of any shape
- x: input data, of the same shape as dout
Returns:
- dx: gradient with respect to x
"""
# Implement the ReLU backward pass.
relu_grad = x > 0 #boolean
dx = dout * relu_grad
return dx
NOTE: Please do not change the code in the cell below. It will run correctly if your code is right.
# THE FOLLOWING CODE IS JUST FOR CHECKING. #
# NO NEED TO CHANGE IT. #
from utils.layer_funcs import relu_forward
from utils.layer_funcs import relu_backward
## Activation layers -- Here we introduce ReLU activation function
## since it is the most commonly used in computer vision problems.
## However, you can also try to implement
## other activation functions like sigmoid, tanh etc.
x = X_dev
dout = np.ones(x.shape)
## ReLU
out = relu_forward(x)
dx = relu_backward(dout, x)
## check by tf.GradientTape.gradient()
x_tf = tf.Variable(x, name='x')
with tf.GradientTape() as tape:
tape.watch(x_tf)
out_tf = tf.nn.relu(x_tf)
grad_gt = tape.gradient(out_tf, x_tf)
out_check = out_tf.numpy()
dx_check = grad_gt.numpy()
## Print validation result
print("Is out correct? {}".format(np.allclose(out, out_check)))
print("Is dx correct? {}".format(np.allclose(dx, dx_check)))
Is out correct? True Is dx correct? True
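The check cell above mentions that you can also try other activation functions such as sigmoid or tanh. This is optional; the sketch below shows what a sigmoid forward/backward pair could look like in the same style as relu_forward and relu_backward (these two functions are only an illustration and are not part of the required layer_funcs.py).
## Optional sketch: sigmoid activation in the same forward/backward style
def sigmoid_forward(x):
    """Element-wise sigmoid."""
    return 1.0 / (1.0 + np.exp(-x))

def sigmoid_backward(dout, x):
    """Backward pass: d(sigmoid)/dx = sigmoid(x) * (1 - sigmoid(x))."""
    s = 1.0 / (1.0 + np.exp(-x))
    return dout * s * (1.0 - s)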
Complete the function softmax_loss
# Stable
# def softmax(X):
# exps = np.exp(X - np.max(X))
# return exps / np.sum(exps)
def softmax(z):
assert len(z.shape) == 2  # expect a batch of logits with shape (N, #classes)
s = np.max(z, axis=1)
s = s[:, np.newaxis] # (z.shape[0], 1)
e_x = np.exp(z - s) # improve the numerical stability
# see https://stats.stackexchange.com/questions/338285/how-does-the-subtraction-of-the-logit-maximum-improve-learning
div = np.sum(e_x, axis=1)
div = div[:, np.newaxis]
return e_x / div
def softmax_loss(x, y):
"""
Softmax loss function, vectorized version.
y_prediction = argmax(softmax(x))
Inputs:
- x: (float) a tensor of shape (N, #classes)
- y: (int) ground truth labels, an array of length N
Returns:
- loss: the cross-entropy loss
- dx: gradients wrt input x
"""
# Initialize the loss.
loss = 0.0
dx = np.zeros_like(x)
# You can use the softmax function defined above here.
# Hint: be careful about numerical overflow.
# First transform the y into onehot
# num_classes = x.shape[1]
# one_hot_y = np.zeros(x.shape[0], num_classes)
# for idx, item in enumerate(y):
# one_hot_y[idx, item] = 1
# Calculate the softmax of x
prob = softmax(x)
# print("prob: ", prob)
log_likelihood = -np.log(prob[range(x.shape[0]),y])
loss = np.sum(log_likelihood) / x.shape[0]
prob_2 = softmax(x)
prob_2[range(x.shape[0]),y] -= 1
dx = prob_2/x.shape[0]
return loss, dx
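For reference, the gradient returned above follows directly from differentiating the mean cross-entropy with respect to the logits:
$$\frac{\partial L}{\partial x_{ij}} = \frac{1}{N}\Big(\mathrm{softmax}(x_i)_j - \mathbb{1}[j = y_i]\Big)$$
which is exactly what the code computes by subtracting 1 at the true-class positions and dividing by N.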
NOTE: Please do not change the code in the cell below. It will run correctly if your code is right.
# THE FOLLOWING CODE IS JUST FOR CHECKING. #
# NO NEED TO CHANGE IT. #
from utils.layer_funcs import softmax_loss
## generate some random data for testing
x = np.random.rand(100,10)
y = np.argmax(x, axis=1)
loss, dx = softmax_loss(x, y)
## check by tf.GradientTape.gradient()
x_tf = tf.Variable(x, name='x')
y_tf = tf.Variable(y, name='y')
with tf.GradientTape() as tape:
tape.watch(x_tf)
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits= x_tf, labels=tf.one_hot(y_tf,10))
loss_tf = tf.reduce_mean(cross_entropy)
dx_tf = tape.gradient(loss_tf, x_tf)
loss_check = loss_tf.numpy()
dx_check = dx_tf.numpy()
## Print validation result
print("Is loss correct? {}".format(np.allclose(loss, loss_check)))
print("Is dx correct? {}".format(np.allclose(dx, dx_check)))
Is loss correct? True Is dx correct? True
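A quick illustration (optional, not part of the assignment) of why the row-wise max is subtracted before exponentiating: with large logits the naive formula overflows, while the shifted version stays well-defined.
## Optional: numerical-stability demo for softmax
big_logits = np.array([[1000.0, 1001.0, 1002.0]])
naive = np.exp(big_logits) / np.sum(np.exp(big_logits), axis=1, keepdims=True)    # inf/inf -> nan
shifted = big_logits - np.max(big_logits, axis=1, keepdims=True)
stable = np.exp(shifted) / np.sum(np.exp(shifted), axis=1, keepdims=True)         # well-defined
print("naive: ", naive)
print("stable:", stable)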
Now try to combine an affine function and a nonlinear activation function into a single fully-connected layer. Edit the code in ./utils/layer_utils.py
$$\mathbf{O} = \mathrm{activation}(\mathbf{W} \times \mathbf{X} + \mathbf{b})$$
For this assignment, you need to create the two types of layers below. You can get started with the skeleton code in ./utils/layer_utils.py. The basic class structure has been provided; you need to fill in the "TODO" part(s).
Class DenseLayer:
Variables: weights, bias
Functions:
__init__: given (input_dim, output_dim, weight_scale)
feedforward: TODO
backward: TODO
Complete the class AffineLayer in ./utils/layer_utils.py
class FullyConnectedLayer(object):
def reset_layer(self, weight_scale=1e-2):
"""
Reset weight and bias.
Inputs:
- weight_scale: (float) define the scale of weights
"""
input_dim = self.input_dim
output_dim = self.output_dim
W = weight_scale*np.random.rand(input_dim, output_dim)
b = np.zeros(output_dim)
self.params = [W, b]
def update_layer(self, params):
"""
Update weight and bias
"""
self.params = params
class AffineLayer(FullyConnectedLayer):
"""
An affine hidden layer performs an affine transform.
"""
def __init__(self, input_dim, output_dim=100, weight_scale=1e-2):
"""
Initialize weight W with random value and
bias b with zero.
Inputs:
- input_dim: (int) the number of input neurons,
like D or D1xD2x...xDn.
- output_dim: (int) the number of hidden neurons in this layer
"""
self.input_dim = input_dim
self.output_dim = output_dim
W = weight_scale*np.random.rand(input_dim, output_dim)
b = np.zeros(output_dim)
self.params = [W, b]
def feedforward(self, X):
"""
Inputs:
- X: (float) a tensor of shape (N,D) or
(N, D1, D2, ..., Dn).
Returns:
- out: output of shape (N, hidden_dim)
"""
W, b = self.params
################################################
# out = X*W + b.
# Use functions in layer_funcs.py
self.X = X
out = affine_forward(x = X, w = W, b = b)
# print(out)
return out
def backward(self, dout):
"""
Inputs:
- dout: (float) a tensor with shape (N, hidden_dim)
Here hidden_dim denotes the number of hidden
neurons
Returns:
- dX: gradients wrt input X, shape (N, D)
- dW: gradients wrt W, shape (D, hidden_dim)
- db: gradients wrt b, length hidden_dim
"""
W, b = self.params
X = self.X
dX = np.zeros_like(X)
dW = np.zeros_like(W)
db = np.zeros_like(b)
################################################
# derive the gradients wrt X, W, b #
# Use layer_funcs.py. #
dX, dW, db = affine_backward(dout = dout, x = X, w = W, b = b)
self.gradients = [dW, db]
return dX
NOTE: Please do not change the code in the cell below. It will run correctly if your code is right.
# THE FOLLOWING CODE IS JUST FOR CHECKING. #
# NO NEED TO CHANGE IT. #
from utils.layer_utils import AffineLayer
## Affine
test_affine = AffineLayer(input_dim=3072,output_dim=100)
w, b = test_affine.params
## Data for correctness check
x = X_dev
dout = np.ones((x.shape[0], 100))
out = test_affine.feedforward(x)
dx = test_affine.backward(dout)
dw, db = test_affine.gradients
## check by tf.GradientTape.gradient()
x_tf = tf.Variable(x, name='x')
w_tf = tf.Variable(w, name='w')
b_tf = tf.Variable(b, name='b')
def affine_layer(x, w, b):
return tf.matmul(x, w) + b
with tf.GradientTape() as tape:
tape.watch(w_tf)
out_tf = affine_layer(x_tf, w_tf, b_tf)
dx_tf, dw_tf, db_tf = tape.gradient(out_tf, (x_tf, w_tf, b_tf))
out_check = out_tf.numpy()
dx_check = dx_tf.numpy()
dw_check = dw_tf.numpy()
db_check = db_tf.numpy()
## Print validation result
print("Is out correct? {}".format(np.allclose(out, out_check)))
print("Is dx correct? {}".format(np.allclose(dx, dx_check)))
print("Is dw correct? {}".format(np.allclose(dw, dw_check)))
print("Is db correct? {}".format(np.allclose(db, db_check)))
Is out correct? True Is dx correct? True Is dw correct? True Is db correct? True
Complete function DenseLayer
class DenseLayer(FullyConnectedLayer):
"""
A dense hidden layer performs an affine transform followed by ReLU.
Here we use ReLU as default activation function.
"""
def __init__(self, input_dim, output_dim=100, weight_scale=1e-2):
"""
Initialize weight W with random value and
bias b with zero.
Inputs:
- input_dim: (int) the number of input neurons,
like D or D1xD2x...xDn.
- output_dim: (int) the number of hidden neurons
in this layer
"""
self.input_dim = input_dim
self.output_dim = output_dim
W = weight_scale*np.random.rand(input_dim, output_dim)
b = np.zeros(output_dim)
self.params = [W, b]
def feedforward(self, X):
"""
Inputs:
- X: (float) a tensor of shape (N,D) or
(N, D1, D2, ..., Dn).
Returns:
- out: output of shape (N, output_dim)
"""
################################################
# out = ReLU(X*W + b). Use functions in #
# layer_funcs.py #
# Output of affine_forward is named as A
# Output of relu_forward is named as out
self.X = X
W, b = self.params
A = affine_forward(x = X, w = W, b = b)
self.A = A
out = relu_forward(A)
return out
def backward(self, dout):
"""
Inputs:
- dout: (float) a tensor with shape (N, hidden_dim)
Returns:
- dX: gradients wrt input X, shape (N, D)
- dW: gradients wrt W, shape (D, hidden_dim)
- db: gradients wrt b, length hidden_dim
"""
W, b = self.params
X = self.X # cache input data
A = self.A # cache intermediate affine result
dX = np.zeros_like(X)
dW = np.zeros_like(W)
db = np.zeros_like(b)
################################################
# derive the gradients wrt X, W, b #
# Use layer_funcs.py. #
################################################
step_1 = relu_backward(dout = dout, x = A)
dX, dW, db = affine_backward(dout = step_1, x = X, w = W, b = b)
self.gradients = [dW, db]
return dX
NOTE: Please do not change the code in the cell below. It will run correctly if your code is right.
# THE FOLLOWING CODE IS JUST FOR CHECKING. #
# NO NEED TO CHANGE IT. #
## First, let's make a dense layer
from utils.layer_utils import DenseLayer
## Affine + ReLU
test_dense = DenseLayer(input_dim=3072,output_dim=100)
w, b = test_dense.params
## Data for correctness check
x = X_dev
dout = np.ones((x.shape[0], 100))
out = test_dense.feedforward(x)
dx = test_dense.backward(dout)
dw, db = test_dense.gradients
## check by tf.GradientTape.gradient()
x_tf = tf.Variable(x, name='x')
w_tf = tf.Variable(w, name='w')
b_tf = tf.Variable(b, name='b')
def dense_layer(x, w, b):
return tf.nn.relu(tf.matmul(x, w) + b)
with tf.GradientTape() as tape:
tape.watch(w_tf)
out_tf = dense_layer(x_tf, w_tf, b_tf)
dx_tf, dw_tf, db_tf = tape.gradient(out_tf, (x_tf, w_tf, b_tf))
out_check = out_tf.numpy()
dx_check = dx_tf.numpy()
dw_check = dw_tf.numpy()
db_check = db_tf.numpy()
## Print validation result
print("Is out correct? {}".format(np.allclose(out, out_check)))
print("Is dx correct? {}".format(np.allclose(dx, dx_check)))
print("Is dw correct? {}".format(np.allclose(dw, dw_check)))
print("Is db correct? {}".format(np.allclose(db, db_check)))
Is out correct? True Is dx correct? True Is dw correct? True Is db correct? True
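Before wrapping these layers into a network class, note that a forward pass of the upcoming two-layer net is just the composition of the two layer objects defined above. The cell below is a small optional illustration; the 3072 -> 100 -> 10 shapes are only an example and reuse the already imported DenseLayer and AffineLayer.
## Optional illustration: composing a DenseLayer and an AffineLayer
hidden = DenseLayer(input_dim=3072, output_dim=100)
scores_layer = AffineLayer(input_dim=100, output_dim=10)
scores = scores_layer.feedforward(hidden.feedforward(X_dev))
print(scores.shape)  # (100, 10): one row of class scores per dev sample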
Complete the class TwoLayerNet in ./utils/classifiers/twolayernet.py. Through this experiment, you will create a two-layer neural network and learn about the backpropagation mechanism. The network structure is like input >> DenseLayer >> AffineLayer >> softmax loss >> output. Complete "TODO" part(s).
Class TwoLayerNet:
Functions:
__init__: GIVEN
loss: TODO - calculate the cross-entropy loss and the gradients wrt all weights and biases, as summarized in the formula below.
step: TODO - a single SGD update of all weights and biases.
predict: TODO - output predicted class labels for the input data
Variables:
layers
TODO: Complete class TwoLayerNet in ./utils/classifiers/twolayernet.py
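The loss you will implement in TwoLayerNet.loss is the mean softmax cross-entropy plus L2 regularization on both weight matrices (this matches the TensorFlow check further below, where tf.nn.l2_loss already includes the factor 1/2):
$$L = \frac{1}{N}\sum_{i=1}^{N} -\log\Big(\mathrm{softmax}\big(f(x_i)\big)_{y_i}\Big) + \frac{\lambda}{2}\left(\lVert W_1\rVert_F^2 + \lVert W_2\rVert_F^2\right)$$
where $f(x_i)$ denotes the class scores produced by the second layer and $\lambda$ is the reg parameter.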
from __future__ import print_function
import numpy as np
from utils.layer_funcs import *
from utils.layer_utils import *
class TwoLayerNet(object):
"""
A two-layer fully-connected neural network. The net has an input dimension of
N, a hidden layer dimension of H, and performs classification over C classes.
We train the network with a softmax loss function and L2 regularization on the
weight matrices. The network uses a ReLU nonlinearity after the first fully
connected layer.
In other words, the network has the following architecture:
input -> DenseLayer -> AffineLayer -> softmax loss -> output
Or more detailed,
input -> affine transform -> ReLU -> affine transform -> softmax -> output
The outputs of the second fully-connected layer are the scores for each class.
"""
def __init__(self, input_dim=3072, hidden_dim=200, num_classes=10, reg=0.0, weight_scale=1e-3):
"""
Inputs:
- reg: (float) L2 regularization
- weight_scale: (float) for layer weight initialization
"""
self.layer1 = DenseLayer(input_dim, hidden_dim, weight_scale=weight_scale)
self.layer2 = AffineLayer(hidden_dim, num_classes, weight_scale=weight_scale)
self.reg = reg
self.velocities = None
def loss(self, X, y):
"""
Calculate the cross-entropy loss and then use backpropagation
to get the gradients wrt W, b in each layer.
Inputs:
- X: input data
- y: ground truth
Return the loss value (float)
"""
loss = 0.0
reg = self.reg
###################################################
# Feedforward #
res_1 = self.layer1.feedforward(X)
res_2 = self.layer2.feedforward(res_1)
# print(res_2.shape)
loss, dx = softmax_loss(x = res_2, y = y)
# print(loss.shape)
# Backpropagation: with just one dense layer
# and one affine layer, this is straightforward
dx = self.layer2.backward(dx)
dx = self.layer1.backward(dx)
# Add L2 regularization
square_weights = np.sum(self.layer1.params[0]**2) + np.sum(self.layer2.params[0]**2)
loss += 0.5*self.reg*square_weights
return loss
def step(self, learning_rate=1e-5, optim='SGD', momentum=0.5):
"""
Use SGD (optionally with momentum) to perform a single-step update of each weight and bias.
"""
# creates new lists with all parameters and gradients
layer1, layer2 = self.layer1, self.layer2
params = layer1.params + layer2.params
grads = layer1.gradients + layer2.gradients
if self.velocities is None:
self.velocities = [np.zeros_like(param) for param in params]
# Add L2 regularization
reg = self.reg
grads = [grad + reg*params[i] for i, grad in enumerate(grads)]
###################################################
#Use SGD or SGD with momentum to update
# variables in layer1 and layer2.
# print(len(grads))
# Without momentum
# for idx, _ in enumerate(params):
# params[idx] -= learning_rate * grads[idx]
# With momentum #preserve gradient information
for idx, _ in enumerate(params):
self.velocities[idx] = momentum * self.velocities[idx] + learning_rate * grads[idx]
params[idx] = params[idx] - self.velocities[idx]
# update parameters in layers
layer1.update_layer(params[0:2])
layer2.update_layer(params[2:4])
# Since layer1 and layer2 already refer to self.layer1 and self.layer2,
# the two reassignments below are not strictly needed; they just make the update explicit.
self.layer1 = layer1
self.layer2 = layer2
def predict(self, X):
"""
Return the label prediction of input data
Inputs:
- X: (float) a tensor of shape (N, D)
Returns:
- predictions: (int) an array of length N
"""
predictions = None
layer1, layer2 = self.layer1, self.layer2
###################################################
# Use the softmax helper from layer_funcs to #
# turn the class scores into probabilities. #
res_1 = self.layer1.feedforward(X)
res_2 = self.layer2.feedforward(res_1)
sft = softmax(res_2)
predictions = np.argmax(sft,axis = 1)
# print(out)
# print(sft.shape)
return predictions
def check_accuracy(self, X, y):
"""
Return the classification accuracy of input data
Inputs:
- X: (float) a tensor of shape (N, D)
- y: (int) an array of length N. ground truth label
Returns:
- acc: (float) between 0 and 1
"""
y_pred = self.predict(X)
acc = np.mean(np.equal(y, y_pred))
return acc
def save_model(self):
"""
Save the model's parameters: both layers' W and b, plus reg
"""
return [self.layer1.params, self.layer2.params, self.reg]
def update_model(self, new_params):
"""
Update layers and reg with new parameters
"""
layer1_params, layer2_params, reg = new_params
self.layer1.update_layer(layer1_params)
self.layer2.update_layer(layer2_params)
self.reg = reg
NOTE: Please do not change the code in the cell below. It will run correctly if your code is right.
# THE FOLLOWING CODE IS JUST FOR CHECKING. #
# NO NEED TO CHANGE IT. #
from utils.classifiers.twolayernet import TwoLayerNet
## Define a model
model = TwoLayerNet(input_dim=3072, hidden_dim=100, num_classes=10, reg=1e-4)
W1, b1 = model.layer1.params
W2, b2 = model.layer2.params
## Backpropagation -- finish the loss and gradient calculation in TwoLayerNet
loss = model.loss(X_dev, y_dev)
## Check loss by tensorflow
x_tf = tf.Variable(X_dev, dtype = tf.float32)
y_tf = tf.Variable(y_dev, dtype = tf.uint8)
W1_tf = tf.Variable(W1.astype('float32'))
b1_tf = tf.Variable(b1.astype('float32'))
W2_tf = tf.Variable(W2.astype('float32'))
b2_tf = tf.Variable(b2.astype('float32'))
h1_tf = tf.nn.relu(tf.matmul(x_tf, W1_tf) + b1_tf)
h2_tf = tf.matmul(h1_tf, W2_tf) + b2_tf
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits= h2_tf, labels=tf.one_hot(y_tf,10))
L2_loss = tf.nn.l2_loss(W1_tf) + tf.nn.l2_loss(W2_tf)
loss_tf = tf.reduce_mean(cross_entropy) + 1e-4 * L2_loss
loss_check=loss_tf.numpy()
## Print validation result
print("Is loss correct? {}".format(np.allclose(loss, loss_check)))
Is loss correct? True
from utils.train_funcs import train, test
We have provided the train() and test() functions in ./utils/train_funcs.py
# THE FOLLOWING CODE IS JUST FOR CHECKING. #
# NO NEED TO CHANGE IT. #
from utils.classifiers.twolayernet import TwoLayerNet
model = TwoLayerNet(input_dim=3072, hidden_dim=100, num_classes=10, reg=1e-4, weight_scale=1e-3)
num_epoch = 10
batch_size = 500
lr = 5e-4
verbose = True
train_acc_hist, val_acc_hist = train(model, X_train, y_train, X_val, y_val,
num_epoch=num_epoch, batch_size=batch_size, learning_rate=lr, verbose=verbose)
test(model, X_test, y_test)
number of batches for training: 98 5000/49000 loss: 5.770709751686648 10000/49000 loss: 2.2987948994635685 15000/49000 loss: 2.2143693677840592 20000/49000 loss: 2.194754474468617 25000/49000 loss: 2.1247470252882787 30000/49000 loss: 2.1130675243680166 35000/49000 loss: 2.053547893320455 40000/49000 loss: 1.98253019754492 45000/49000 loss: 1.8852036075603205 epoch 1: valid acc = 0.294, new learning rate = 0.000475 5000/49000 loss: 1.9417299455742714 10000/49000 loss: 1.9506399121534186 15000/49000 loss: 1.8677240946612796 20000/49000 loss: 1.8910204748440222 25000/49000 loss: 1.755462877324926 30000/49000 loss: 1.8300118617216237 35000/49000 loss: 1.7803856257090727 40000/49000 loss: 1.756172358934598 45000/49000 loss: 1.8183637165515587 epoch 2: valid acc = 0.387, new learning rate = 0.00045125 5000/49000 loss: 1.7189770030441958 10000/49000 loss: 1.7109008619415367 15000/49000 loss: 1.7666187751068603 20000/49000 loss: 1.71865505278556 25000/49000 loss: 1.6642184183179176 30000/49000 loss: 1.6951101427521704 35000/49000 loss: 1.7082376083421198 40000/49000 loss: 1.6331673074458293 45000/49000 loss: 1.6937730405729627 epoch 3: valid acc = 0.405, new learning rate = 0.0004286875 5000/49000 loss: 1.6696872368969091 10000/49000 loss: 1.5916823345499134 15000/49000 loss: 1.620423352161367 20000/49000 loss: 1.6334327986528236 25000/49000 loss: 1.6450992705112075 30000/49000 loss: 1.5802404678368671 35000/49000 loss: 1.5776503059684088 40000/49000 loss: 1.61177670834835 45000/49000 loss: 1.5915863981097842 epoch 4: valid acc = 0.421, new learning rate = 0.00040725312499999993 5000/49000 loss: 1.52971667842817 10000/49000 loss: 1.5185957808644657 15000/49000 loss: 1.6103599753402524 20000/49000 loss: 1.4986585204617828 25000/49000 loss: 1.522178139077 30000/49000 loss: 1.630848779770945 35000/49000 loss: 1.578601515345025 40000/49000 loss: 1.5170029278568897 45000/49000 loss: 1.5301864155370501 epoch 5: valid acc = 0.432, new learning rate = 0.0003868904687499999 5000/49000 loss: 1.5792182932321417 10000/49000 loss: 1.5331690007458492 15000/49000 loss: 1.511264267422675 20000/49000 loss: 1.571510601947299 25000/49000 loss: 1.5077547003950331 30000/49000 loss: 1.5260764773967914 35000/49000 loss: 1.4789704636504488 40000/49000 loss: 1.4753460602156347 45000/49000 loss: 1.4877084552691453 epoch 6: valid acc = 0.45, new learning rate = 0.0003675459453124999 5000/49000 loss: 1.5403047656972666 10000/49000 loss: 1.4754420136383581 15000/49000 loss: 1.500364824311338 20000/49000 loss: 1.4534295217472444 25000/49000 loss: 1.4862971147588022 30000/49000 loss: 1.549220275083971 35000/49000 loss: 1.467283520227175 40000/49000 loss: 1.4708345288217592 45000/49000 loss: 1.4646358922435019 epoch 7: valid acc = 0.458, new learning rate = 0.00034916864804687486 5000/49000 loss: 1.445988045825306 10000/49000 loss: 1.445821701107501 15000/49000 loss: 1.4738857252700668 20000/49000 loss: 1.4319330872482934 25000/49000 loss: 1.4725972721378957 30000/49000 loss: 1.4204077364762653 35000/49000 loss: 1.524277463620054 40000/49000 loss: 1.4561389383783732 45000/49000 loss: 1.3765336741658947 epoch 8: valid acc = 0.473, new learning rate = 0.0003317102156445311 5000/49000 loss: 1.3841362260819758 10000/49000 loss: 1.4291171375615956 15000/49000 loss: 1.3567032662761862 20000/49000 loss: 1.5752362337694479 25000/49000 loss: 1.4303344652181817 30000/49000 loss: 1.3916627078292345 35000/49000 loss: 1.4026141836588497 40000/49000 loss: 1.4652863468762463 45000/49000 loss: 1.3812476863698009 epoch 9: valid acc = 0.476, new 
learning rate = 0.0003151247048623045 5000/49000 loss: 1.2924697759681287 10000/49000 loss: 1.4576734336570678 15000/49000 loss: 1.4413394463861953 20000/49000 loss: 1.4187580534166742 25000/49000 loss: 1.3917367844975386 30000/49000 loss: 1.4045043545037692 35000/49000 loss: 1.398220175022123 40000/49000 loss: 1.4006657733979062 45000/49000 loss: 1.3922858619584284 epoch 10: valid acc = 0.476, new learning rate = 0.00029936846961918924 test acc: 0.4859
0.4859
Plot the training and validation accuracy history of each epoch
SOLUTION (enter a new cell below):
plt.plot(train_acc_hist)
plt.plot(val_acc_hist)
[<matplotlib.lines.Line2D at 0x13df74410>]
Visualization of the intermediate weights can help you get an intuitive understanding of how the network works, especially in Convolutional Neural Networks (CNNs).
from utils.display_funcs import visualize_pics
weights = model.layer1.params[0]
pics = weights.reshape(3, 32, 32, -1).transpose(3, 1, 2, 0)
## visualization
visualize_pics(pics)
num of feature vectors: 100
For this part, you need to train a better two-layer net. The requirement is a test accuracy above 50%; you will lose 5 points for each 1% below that threshold.
Here are some recommended methods for improving the performance. Feel free to try any other method as you see fit.
A comparison between SGD and SGD with momentum (see the short sketch below):
SGD: w = w - learning_rate * gradient
SGD with momentum:
v = momentum * v + learning_rate * gradient
w = w - v
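A minimal sketch of the two update rules on toy numbers. The names below (w_toy, grad_toy, etc.) are only for this illustration and are unrelated to the assignment's training loop; TwoLayerNet.step applies the momentum variant to every parameter.
## Toy comparison of vanilla SGD and SGD with momentum
w_toy = np.array([1.0, -2.0])
grad_toy = np.array([0.5, -0.1])
lr_toy, momentum_toy = 1e-2, 0.9

# Vanilla SGD: move directly against the gradient
w_sgd = w_toy - lr_toy * grad_toy

# SGD with momentum: the velocity accumulates an exponentially decaying
# history of past gradients, which smooths and accelerates the updates
v_toy = np.zeros_like(w_toy)
v_toy = momentum_toy * v_toy + lr_toy * grad_toy
w_momentum = w_toy - v_toy

print(w_sgd, w_momentum)  # identical on the first step; they differ once v_toy is non-zero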
TODO
from utils.classifiers.twolayernet import TwoLayerNet
# Use previous layers to create a two layer neural network.
# Try several solutions and report the best performing one.
# input->(affine->activation)->(affine->softmax)->output
# The recommended activation function is ReLU. You can
# make a comparison with other activation functions to see
# the differences.
#
# You will need to execute code similar to below, using your parameter specs:
# model = TwoLayerNet(input_dim=TBD, hidden_dim=TBD, num_classes=TBD, reg=TBD, weight_scale=TBD)
# num_epoch = TBD
# batch_size = TBD
# lr = TBD
# verbose = TBD
# train_acc_hist, val_acc_hist = train(TBD)
# test(TBD, TBD, TBD)
SOLUTION (enter a new cell below):
# This one is with momentum.
model = TwoLayerNet(input_dim=3072, hidden_dim=256, num_classes=10, reg=1e-3, weight_scale=1e-2)
num_epoch = 50
batch_size = 256
lr = 1e-3
verbose = False
train_acc_hist, val_acc_hist = train(model, X_train, y_train, X_val, y_val, num_epoch=num_epoch, batch_size=batch_size, learning_rate=lr, verbose=verbose)
test(model, X_test, y_test)
number of batches for training: 191 epoch 1: valid acc = 0.18, new learning rate = 0.00095 epoch 2: valid acc = 0.302, new learning rate = 0.0009025 epoch 3: valid acc = 0.332, new learning rate = 0.000857375 epoch 4: valid acc = 0.381, new learning rate = 0.0008145062499999999 epoch 5: valid acc = 0.387, new learning rate = 0.0007737809374999998 epoch 6: valid acc = 0.429, new learning rate = 0.0007350918906249997 epoch 7: valid acc = 0.442, new learning rate = 0.0006983372960937497 epoch 8: valid acc = 0.455, new learning rate = 0.0006634204312890621 epoch 9: valid acc = 0.459, new learning rate = 0.000630249409724609 epoch 10: valid acc = 0.473, new learning rate = 0.0005987369392383785 epoch 11: valid acc = 0.475, new learning rate = 0.0005688000922764595 epoch 12: valid acc = 0.477, new learning rate = 0.0005403600876626365 epoch 13: valid acc = 0.489, new learning rate = 0.0005133420832795047 epoch 14: valid acc = 0.487, new learning rate = 0.00048767497911552944 epoch 15: valid acc = 0.495, new learning rate = 0.00046329123015975297 epoch 16: valid acc = 0.492, new learning rate = 0.0004401266686517653 epoch 17: valid acc = 0.516, new learning rate = 0.00041812033521917703 epoch 18: valid acc = 0.502, new learning rate = 0.00039721431845821814 epoch 19: valid acc = 0.513, new learning rate = 0.0003773536025353072 epoch 20: valid acc = 0.498, new learning rate = 0.0003584859224085418 epoch 21: valid acc = 0.511, new learning rate = 0.0003405616262881147 epoch 22: valid acc = 0.509, new learning rate = 0.00032353354497370894 epoch 23: valid acc = 0.509, new learning rate = 0.00030735686772502346 epoch 24: valid acc = 0.511, new learning rate = 0.00029198902433877225 epoch 25: valid acc = 0.519, new learning rate = 0.00027738957312183364 epoch 26: valid acc = 0.515, new learning rate = 0.0002635200944657419 epoch 27: valid acc = 0.51, new learning rate = 0.0002503440897424548 epoch 28: valid acc = 0.52, new learning rate = 0.00023782688525533205 epoch 29: valid acc = 0.524, new learning rate = 0.00022593554099256544 epoch 30: valid acc = 0.532, new learning rate = 0.00021463876394293716 epoch 31: valid acc = 0.522, new learning rate = 0.0002039068257457903 epoch 32: valid acc = 0.526, new learning rate = 0.00019371148445850077 epoch 33: valid acc = 0.514, new learning rate = 0.00018402591023557573 epoch 34: valid acc = 0.53, new learning rate = 0.00017482461472379692 epoch 35: valid acc = 0.511, new learning rate = 0.00016608338398760707 epoch 36: valid acc = 0.523, new learning rate = 0.0001577792147882267 epoch 37: valid acc = 0.53, new learning rate = 0.00014989025404881537 epoch 38: valid acc = 0.511, new learning rate = 0.00014239574134637458 epoch 39: valid acc = 0.526, new learning rate = 0.00013527595427905584 epoch 40: valid acc = 0.513, new learning rate = 0.00012851215656510304 epoch 41: valid acc = 0.508, new learning rate = 0.00012208654873684788 epoch 42: valid acc = 0.519, new learning rate = 0.00011598222130000548 epoch 43: valid acc = 0.518, new learning rate = 0.00011018311023500519 epoch 44: valid acc = 0.515, new learning rate = 0.00010467395472325493 epoch 45: valid acc = 0.508, new learning rate = 9.944025698709218e-05 epoch 46: valid acc = 0.506, new learning rate = 9.446824413773756e-05 epoch 47: valid acc = 0.502, new learning rate = 8.974483193085068e-05 epoch 48: valid acc = 0.521, new learning rate = 8.525759033430814e-05 epoch 49: valid acc = 0.51, new learning rate = 8.099471081759274e-05 epoch 50: valid acc = 0.513, new learning rate = 
7.69449752767131e-05 test acc: 0.5114
0.5114
plt.plot(train_acc_hist)
plt.plot(val_acc_hist)
[<matplotlib.lines.Line2D at 0x13f211ad0>]
SOLUTION (enter a new cell below):
## Visualize weights of the first layer
W1 = model.layer1.params[0]
b1 = model.layer1.params[1]
print(W1.shape)
# Plot the W1
import matplotlib.cm as cm
plt.imshow(W1, cmap=cm.jet)
(3072, 256)
<matplotlib.image.AxesImage at 0x13e635fd0>
# See the incoming weights of the first hidden unit.
W_first_hidden = W1[:,0]
print(W_first_hidden.shape)
reshape_W = W_first_hidden.reshape(3,32,32)
# Example: show the first color channel of this weight template
plt.imshow(reshape_W[0], cmap=cm.jet)
(3072,)
<matplotlib.image.AxesImage at 0x13d15a4d0>
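To look at the full RGB template of that hidden unit rather than a single channel, the three channels can be stacked and rescaled to [0, 1] for imshow. This is a small optional sketch reusing reshape_W from the cell above.
## Optional: show the full RGB weight template of the first hidden unit
rgb = reshape_W.transpose(1, 2, 0)                 # (32, 32, 3)
rgb = (rgb - rgb.min()) / (rgb.max() - rgb.min())  # rescale to [0, 1] for display
plt.imshow(rgb)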
## Create "save_model" folder if it does not exist
save_dir = "./save_models/"
if not os.path.exists(save_dir):
os.makedirs(save_dir)
## Save your model
save_params = model.save_model()
with open("./save_models/best_model.pkl", "wb") as output_file:
pickle.dump(save_params, output_file)
## Load your model - FYI
#with open("./save_models/best_model.pkl", "rb") as input_file:
# load_params = pickle.load(input_file)
#model.update_model(load_params)
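If you do reload the saved parameters later, a quick way to confirm the round trip worked is to re-check the test accuracy. A minimal sketch, assuming the model object and the test split are still in memory:
## Optional: reload the saved parameters and confirm the test accuracy
with open("./save_models/best_model.pkl", "rb") as input_file:
    load_params = pickle.load(input_file)
model.update_model(load_params)
print("reloaded test acc:", model.check_accuracy(X_test, y_test))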