Columbia University

ECBM E4040 Neural Networks and Deep Learning

Yi-Pei Chan

Task 3: Multilayer Perceptron (MLP)

This is the third part of the assignment. You will implement an MLP using TensorFlow.

In [15]:
# Import modules
from __future__ import print_function
import tensorflow as tf
import numpy as np
import time
import os
import matplotlib.pyplot as plt
import pickle
import warnings
warnings.filterwarnings('ignore')

from utils.cifar_utils import load_data

# Plot configurations
%matplotlib inline

# Notebook auto reloads code. (Ref: http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython)
%load_ext autoreload
%autoreload 2
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

Load Data

In [16]:
# Load the raw CIFAR-10 data.
X_train, y_train, X_test, y_test = load_data()

# Data organizations:
# Train data: 49000 samples from the original train set: 1~49000
# Validation data: 1000 samples from the original train set: 49000~50000
# Test data: 10000 samples from the original test set: 1~10000
# Development data (for gradient check): 100 samples from the train set: 1~49000
num_training = 49000
num_validation = 1000
num_dev = 100

X_val = X_train[-num_validation:, :]
y_val = y_train[-num_validation:]

mask = np.random.choice(num_training, num_dev, replace=False)
X_dev = X_train[mask]
y_dev = y_train[mask]

X_train = X_train[:num_training, :]
y_train = y_train[:num_training]

# Preprocessing: subtract the mean value across every dimension for training data
mean_image = np.mean(X_train, axis=0)

mean_image = mean_image.astype(np.float32)

X_train = X_train.astype(np.float32) - mean_image
X_val = X_val.astype(np.float32) - mean_image
X_test = X_test.astype(np.float32) - mean_image
X_dev = X_dev.astype(np.float32) - mean_image

print(X_train.shape, X_val.shape, X_test.shape, X_dev.shape)
print('Train data shape: ', X_train.shape)
print('Train labels shape: ', y_train.shape)
print('Validation data shape: ', X_val.shape)
print('Validation labels shape: ', y_val.shape)
print('Test data shape: ', X_test.shape)
print('Test labels shape: ', y_test.shape)
print('Development data shape:', X_dev.shape)
print('Development labels shape: ', y_dev.shape)
(49000, 3072) (1000, 3072) (10000, 3072) (100, 3072)
Train data shape:  (49000, 3072)
Train labels shape:  (49000,)
Validation data shape:  (1000, 3072)
Validation labels shape:  (1000,)
Test data shape:  (10000, 3072)
Test labels shape:  (10000,)
Development data shape: (100, 3072)
Development labels shape:  (100,)

Part 1: Tensorflow MLP

In this part, you will use TensorFlow modules to implement an MLP. We provide a demo of a two-layer net whose style follows https://www.tensorflow.org/guide/keras and https://www.tensorflow.org/guide/eager.

You need to implement a multilayer perceptron with 3 layers in a similar style.

Demo: Two-layer MLP in TensorFlow

In [17]:
## Demo: Two-layer net in tensorflow (eager execution mode)
hidden_dim = 100
reg_tf = tf.constant(0.01)

# define a tf.keras.Model class
class Model(tf.keras.Model):
    def __init__(self):
        super(Model, self).__init__()
        self.W1 = tf.Variable(1e-2*np.random.rand(3072, hidden_dim).astype('float32'))
        self.b1 = tf.Variable(np.zeros((hidden_dim,)).astype('float32'))
        self.W2 = tf.Variable(1e-2*np.random.rand(hidden_dim, 10).astype('float32'))
        self.b2 = tf.Variable(np.zeros((10,)).astype('float32'))
    def call(self, inputs):
        """Run the model."""
        h1 = tf.nn.relu(tf.matmul(inputs, self.W1) + self.b1)
        out = tf.matmul(h1, self.W2) + self.b2
        return out

# Define the loss function (note: in eager execution, the loss is computed inside a function so it can be recorded by tf.GradientTape)
def loss(model, inputs, targets, reg = tf.constant(0.01)):
    out = model(inputs)
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=out, labels=tf.one_hot(targets, 10))
    L2_loss = tf.nn.l2_loss(model.W1) + tf.nn.l2_loss(model.W2)
    return tf.reduce_mean(cross_entropy) + reg * L2_loss

# calculate gradients for all variables using tf.GradientTape
def grad(model, inputs, targets, reg = tf.constant(0.01)):
    with tf.GradientTape() as tape:
        loss_value = loss(model, inputs, targets, reg=reg)
    return tape.gradient(loss_value, [model.W1, model.b1, model.W2, model.b2])

# calculate classification accuracy
def eval_acc(model, inputs, targets):
    correct_prediction = tf.equal(targets, tf.cast(tf.argmax(model(inputs),1), tf.uint8))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    return accuracy

num_train = 49000
batch_size = 500
num_batch = num_train//batch_size
num_epochs = 10
model = Model()
optimizer = tf.keras.optimizers.SGD(learning_rate=1e-3)

for e in range(num_epochs):
    for i in range(num_batch):
        batch_xs, batch_ys = X_train[i*batch_size:(i+1)*batch_size], y_train[i*batch_size:(i+1)*batch_size]
        x_tf = tf.Variable(batch_xs, dtype = tf.float32)
        y_tf = tf.Variable(batch_ys, dtype = tf.uint8)
        
        grads = grad(model, x_tf, y_tf, reg_tf)
        #optimization based on calculated gradients 
        optimizer.apply_gradients(zip(grads, [model.W1, model.b1, model.W2, model.b2]))

    x_tf = tf.Variable(X_val, dtype = tf.float32)
    y_tf = tf.Variable(y_val, dtype = tf.uint8)
    accuracy = eval_acc(model, x_tf, y_tf)
    val_acc = accuracy.numpy()
    print('epoch {}: valid acc = {}'.format(e+1, val_acc))

x_tf = tf.Variable(X_test, dtype = tf.float32)
y_tf = tf.Variable(y_test, dtype = tf.uint8)
accuracy = eval_acc(model, x_tf, y_tf)
test_acc = accuracy.numpy()
print('test acc = {}'.format(test_acc))
epoch 1: valid acc = 0.19300000369548798
epoch 2: valid acc = 0.25
epoch 3: valid acc = 0.2879999876022339
epoch 4: valid acc = 0.32499998807907104
epoch 5: valid acc = 0.3199999928474426
epoch 6: valid acc = 0.36000001430511475
epoch 7: valid acc = 0.36800000071525574
epoch 8: valid acc = 0.3869999945163727
epoch 9: valid acc = 0.40700000524520874
epoch 10: valid acc = 0.4169999957084656
test acc = 0.4205000102519989

Create Deeper Network

TODO:

Create your MLP in TensorFlow. Since you are going to create a deeper neural network, it is recommended to store your network parameters (weights and biases) in a list and to build the layers in a loop. Hint: copy the code above and make the necessary changes in the model definition; a sketch of the list-based pattern is shown below.
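
A minimal sketch of that pattern, assuming the same tf/np imports as above (the name DeepModel and the sizes in dims are illustrative, not part of the required solution):

# Sketch: build an MLP of arbitrary depth by looping over a list of layer sizes.
dims = [3072, 512, 512, 10]  # hypothetical [input, hidden..., output] sizes

class DeepModel(tf.keras.Model):
    def __init__(self):
        super(DeepModel, self).__init__()
        # One (W, b) pair per layer, kept in lists; tf.keras.Model tracks
        # tf.Variable objects stored in list attributes.
        self.Ws = [tf.Variable(1e-2*np.random.rand(m, n).astype('float32'))
                   for m, n in zip(dims[:-1], dims[1:])]
        self.bs = [tf.Variable(np.zeros((n,)).astype('float32'))
                   for n in dims[1:]]
    def call(self, inputs):
        h = inputs
        # Nonlinearity on every layer except the last, which emits raw logits.
        for W, b in zip(self.Ws[:-1], self.bs[:-1]):
            h = tf.nn.relu(tf.matmul(h, W) + b)
        return tf.matmul(h, self.Ws[-1]) + self.bs[-1]

With this layout, tape.gradient(loss_value, model.trainable_variables) and optimizer.apply_gradients(zip(grads, model.trainable_variables)) replace the hand-written variable lists.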

SOLUTION (enter a new cell below):

In [18]:
hidden_dim_1 = 512
hidden_dim_2 = 512

reg_tf = tf.constant(0.01)

# define a tf.keras.Model class
class Model(tf.keras.Model):
    def __init__(self):
        super(Model, self).__init__()
        self.W1 = tf.Variable(1e-2*np.random.rand(3072, hidden_dim_1).astype('float32'))
        self.b1 = tf.Variable(np.zeros((hidden_dim_1,)).astype('float32'))
        self.W2 = tf.Variable(1e-2*np.random.rand(hidden_dim_1, hidden_dim_2).astype('float32'))
        self.b2 = tf.Variable(np.zeros((hidden_dim_2,)).astype('float32'))
        self.W3 = tf.Variable(1e-2*np.random.rand(hidden_dim_2, 10).astype('float32'))
        self.b3 = tf.Variable(np.zeros((10,)).astype('float32'))
    def call(self, inputs):
        """Run the model."""
        h1 = tf.nn.sigmoid(tf.matmul(inputs, self.W1) + self.b1)
        h2 = tf.nn.relu(tf.matmul(h1, self.W2) + self.b2)
        out = tf.matmul(h2, self.W3) + self.b3
        return out

# Define and calculate loss function (Note that in eager execution, loss must be in a function)
def loss(model, inputs, targets, reg = tf.constant(0.01)):
    out = model(inputs)
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=out, labels=tf.one_hot(targets, 10))
    L2_loss = tf.nn.l2_loss(model.W1) + tf.nn.l2_loss(model.W2) + tf.nn.l2_loss(model.W3)
    return tf.reduce_mean(cross_entropy) + reg * L2_loss

# calculate gradients for all variables using tf.GradientTape
def grad(model, inputs, targets, reg = tf.constant(0.01)):
    with tf.GradientTape() as tape:
        loss_value = loss(model, inputs, targets, reg=reg)
    return tape.gradient(loss_value, [model.W1, model.b1, model.W2, model.b2, model.W3, model.b3])

# calculate classification accuracy
def eval_acc(model, inputs, targets):
    correct_prediction = tf.equal(targets, tf.cast(tf.argmax(model(inputs),1), tf.uint8))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    return accuracy

num_train = 49000
batch_size = 500
num_batch = num_train//batch_size
num_epochs = 10
model = Model()
# optimizer = tf.keras.optimizers.SGD(learning_rate=1e-3)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False)

for e in range(num_epochs):
    for i in range(num_batch):
        batch_xs, batch_ys = X_train[i*batch_size:(i+1)*batch_size], y_train[i*batch_size:(i+1)*batch_size]
        x_tf = tf.Variable(batch_xs, dtype = tf.float32)
        y_tf = tf.Variable(batch_ys, dtype = tf.uint8)
        
        grads = grad(model, x_tf, y_tf, reg_tf)
        #optimization based on calculated gradients 
        optimizer.apply_gradients(zip(grads, [model.W1, model.b1, model.W2, model.b2, model.W3, model.b3]))

    x_tf = tf.Variable(X_val, dtype = tf.float32)
    y_tf = tf.Variable(y_val, dtype = tf.uint8)
    accuracy = eval_acc(model, x_tf, y_tf)
    val_acc = accuracy.numpy()
    print('epoch {}: valid acc = {}'.format(e+1, val_acc))

x_tf = tf.Variable(X_test, dtype = tf.float32)
y_tf = tf.Variable(y_test, dtype = tf.uint8)
accuracy = eval_acc(model, x_tf, y_tf)
test_acc = accuracy.numpy()
print('test acc = {}'.format(test_acc))
epoch 1: valid acc = 0.3019999861717224
epoch 2: valid acc = 0.3499999940395355
epoch 3: valid acc = 0.34599998593330383
epoch 4: valid acc = 0.35100001096725464
epoch 5: valid acc = 0.3569999933242798
epoch 6: valid acc = 0.3479999899864197
epoch 7: valid acc = 0.36500000953674316
epoch 8: valid acc = 0.33799999952316284
epoch 9: valid acc = 0.3580000102519989
epoch 10: valid acc = 0.35899999737739563
test acc = 0.37059998512268066

Part 2: t-SNE (optional, bonus +10 points)

t-SNE is a machine learning algorithm for nonlinear dimensionality reduction developed by Laurens van der Maaten and Geoffrey Hinton [1]. It is also a good way of visualizing high-dimensional data in 2D. We show its application to CIFAR-10; later it will be reused in a CNN network. Experimenting with t-SNE can be fun. One thing to try is to visualize the output of each layer of the MLP to observe the differences; see the sketch after the references below.

[1] van der Maaten, Laurens, and Geoffrey Hinton. "Visualizing Data using t-SNE." Journal of Machine Learning Research 9 (Nov 2008): 2579-2605.

[2] Jacobs' adaptive learning rate scheme: https://www.willamette.edu/~gorr/classes/cs449/Momentum/deltabardelta.html

[3] http://cs.stanford.edu/people/karpathy/cnnembed/

[4] Wattenberg, Viégas, and Johnson. "How to Use t-SNE Effectively." Distill, 2016. https://distill.pub/2016/misread-tsne
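
A minimal sketch of the per-layer experiment suggested above, assuming the MLP model (with its layers[i].feedforward interface) and the tsne helper that are imported and used in the cells below; tsne_per_layer is an illustrative name, not part of the provided utils:

# Sketch: run t-SNE on the activations after each layer of a trained MLP.
def tsne_per_layer(model, X, labels, perplexity=30.0):
    h = X
    colors = np.random.rand(10, 3)
    color_labels = [colors[int(i)] for i in labels.tolist()]
    for idx, layer in enumerate(model.layers):
        h = layer.feedforward(h)  # activations after layer idx
        Y = tsne(h, low_dim=2, perplexity=perplexity)
        plt.scatter(Y[:, 0], Y[:, 1], 20, color_labels)
        plt.title('t-SNE after layer {}'.format(idx))
        plt.show()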

In [19]:
from utils.train_funcs import train, test
from utils.classifiers.mlp import MLP
from utils.features.tsne import tsne
In [20]:
# Load the raw CIFAR-10 data.
X_train, y_train, X_test, y_test = load_data()
X_train = X_train.reshape([50000,3,32,32]).transpose((0,2,3,1))
X_test = X_test.reshape([10000,3,32,32]).transpose((0,2,3,1))

# Data organizations:
# Train data: 49000 samples from the original train set: 1~49000
# Validation data: 1000 samples from the original train set: 49000~50000
# Test data: 10000 samples from the original test set: 1~10000
# Development data (for gradient check): 100 samples from the train set: 1~49000
num_training = 49000
num_validation = 1000
num_dev = 100

X_val = X_train[-num_validation:]
y_val = y_train[-num_validation:]

mask = np.random.choice(num_training, num_dev, replace=False)
X_dev = X_train[mask]
y_dev = y_train[mask]

X_train = X_train[:num_training]
y_train = y_train[:num_training]

# Preprocessing: subtract the mean value across every dimension for training data
mean_image = np.mean(X_train, axis=0)

X_train = X_train.astype(np.float32) - mean_image.astype(np.float32)
X_val = X_val.astype(np.float32) - mean_image.astype(np.float32)
X_test = X_test.astype(np.float32) - mean_image.astype(np.float32)
X_dev = X_dev.astype(np.float32) - mean_image.astype(np.float32)

print('Train data shape: ', X_train.shape)
print('Train labels shape: ', y_train.shape)
print('Validation data shape: ', X_val.shape)
print('Validation labels shape: ', y_val.shape)
print('Test data shape: ', X_test.shape)
print('Test labels shape: ', y_test.shape)
print('Development data shape:', X_dev.shape)
print('Development labels shape', y_dev.shape)
Train data shape:  (49000, 32, 32, 3)
Train labels shape:  (49000,)
Validation data shape:  (1000, 32, 32, 3)
Validation labels shape:  (1000,)
Test data shape:  (10000, 32, 32, 3)
Test labels shape:  (10000,)
Development data shape: (100, 32, 32, 3)
Development labels shape (100,)

tSNE of original data

In [21]:
random_select = np.random.choice(10000, 500, replace=False)
X = X_test[random_select,:,:,0].reshape(500,1024).astype('float')/255.0
tic = time.time()
Y = tsne(X, low_dim=2, perplexity=30.0)
print("it takes {} seconds".format(time.time()-tic))
2.4685759314954216
The 100 th loop cost: 13.61064902067933, computation time: 1.7419660091400146
The 200 th loop cost: 1.145007186069041, computation time: 3.9461472034454346
The 300 th loop cost: 1.1405191870984794, computation time: 6.122726917266846
The 400 th loop cost: 1.140458939930807, computation time: 8.088608026504517
The 500 th loop cost: 1.140458668210997, computation time: 10.895421981811523
The 600 th loop cost: 1.140458667765565, computation time: 12.29001498222351
The 700 th loop cost: 1.1404586677647504, computation time: 13.667845964431763
The 800 th loop cost: 1.1404586677647457, computation time: 15.010760068893433
The 900 th loop cost: 1.140458667764746, computation time: 16.314085960388184
The 1000 th loop cost: 1.140458667764746, computation time: 17.630610942840576
it takes 18.160896062850952 seconds
In [22]:
## visualize tSNE of original data
labels = y_test[random_select]
colors = np.random.rand(10,3)
color_labels = [colors[int(i)] for i in labels.tolist()]
plt.scatter(Y[:,0], Y[:,1], 20, color_labels)
plt.show()

tSNE of data after two hidden layers

Visualize the tSNE of the data after it has passed through the MLP. In the visualization result, you should find that, in comparison with the tSNE of the original data, where all data points are mixed together, the tSNE of the data after the two-layer network appears as multiple clusters in the 2D plane.

In [23]:
# Define MLP model
model = MLP(input_dim=3072, hidden_dims=[100], num_classes=10, reg=0.1, weight_scale=1e-3)

num_epoch = 10
batch_size = 200
lr = 1e-3
verbose = False
train_acc_hist, val_acc_hist = train(model, X_train, y_train, X_val, y_val, 
                  num_epoch=num_epoch, batch_size=batch_size, learning_rate=lr, verbose=verbose)
test(model, X_test, y_test)
number of batches for training: 245
epoch 1: valid acc = 0.413, new learning rate = 0.00095
epoch 2: valid acc = 0.442, new learning rate = 0.0009025
epoch 3: valid acc = 0.448, new learning rate = 0.000857375
epoch 4: valid acc = 0.482, new learning rate = 0.0008145062499999999
epoch 5: valid acc = 0.493, new learning rate = 0.0007737809374999998
epoch 6: valid acc = 0.493, new learning rate = 0.0007350918906249997
epoch 7: valid acc = 0.498, new learning rate = 0.0006983372960937497
epoch 8: valid acc = 0.499, new learning rate = 0.0006634204312890621
epoch 9: valid acc = 0.49, new learning rate = 0.000630249409724609
epoch 10: valid acc = 0.478, new learning rate = 0.0005987369392383785
test acc: 0.4987
Out[23]:
0.4987
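
As an aside, the learning rates logged above decay by a factor of 0.95 each epoch (0.001 → 0.00095 → 0.0009025 → …), in the spirit of the adaptive scheme in [2]. A minimal sketch of that schedule, assuming the fixed multiplicative rule read off the log:

# Sketch: the learning-rate schedule implied by the training log above.
lr = 1e-3
for epoch in range(10):
    lr *= 0.95  # assumed per-epoch decay factor
    print('epoch {}: new learning rate = {}'.format(epoch + 1, lr))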

TODO: Visualize data that is passed through the MLP model defined above using tSNE.

In [24]:
# run tSNE
X = X_test[random_select]
tic = time.time()

# Hint: Pass data through affine and dense layers (model.layers) and then 
# apply softmax to obtain output of the MLP model.
#############################################################################
from utils.layer_funcs import softmax
in_x = model.layers[0].feedforward(X)
in_x = model.layers[1].feedforward(in_x)
in_x = softmax(in_x)


print("it takes {} seconds".format(time.time()-tic))
it takes 0.0317540168762207 seconds
In [25]:
# visualize tSNE 2D representation of data after two hidden layers
# Hint: See tSNE visualization of original data
#############################################################################
tic = time.time()
Y = tsne(in_x, low_dim=2, perplexity=30.0)
print("it takes {} seconds".format(time.time()-tic))

labels = y_test[random_select]
colors = np.random.rand(10,3)
color_labels = [colors[int(i)] for i in labels.tolist()]
plt.scatter(Y[:,0], Y[:,1], 20, color_labels)
plt.show()
0.2067841526114124
The 100 th loop cost: 11.050253847984472, computation time: 1.6079530715942383
The 200 th loop cost: 0.48779429203338154, computation time: 3.453407049179077
The 300 th loop cost: 0.4750465482681813, computation time: 5.705897092819214
The 400 th loop cost: 0.47427938769925476, computation time: 7.3215179443359375
The 500 th loop cost: 0.47424053856812864, computation time: 8.907883167266846
The 600 th loop cost: 0.47423793467872805, computation time: 10.60624384880066
The 700 th loop cost: 0.4742377453669589, computation time: 12.36299204826355
The 800 th loop cost: 0.4742377307227033, computation time: 14.008923053741455
The 900 th loop cost: 0.4742377295389228, computation time: 15.694175004959106
The 1000 th loop cost: 0.47423772944385445, computation time: 17.119104146957397
it takes 17.939522981643677 seconds

TODO: Try tuning the parameters of tSNE and visualize the new result. A sketch of a perplexity sweep is shown below.
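
One way to explore before committing to a single setting: sweep perplexity and compare the embeddings side by side. A minimal sketch, reusing in_x and color_labels from the cells above (the values in perplexities are an arbitrary illustrative choice):

# Sketch: compare t-SNE embeddings across several perplexity values.
perplexities = [5.0, 30.0, 50.0]
for p in perplexities:
    Y_p = tsne(in_x, low_dim=2, perplexity=p)
    plt.scatter(Y_p[:, 0], Y_p[:, 1], 20, color_labels)
    plt.title('perplexity = {}'.format(p))
    plt.show()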

In [26]:
# Tune the parameter, show the results.
# run tSNE
X = X_test[random_select]

tic = time.time()
Y = tsne(in_x, low_dim=2, perplexity=40.0)

print("it takes {} seconds".format(time.time()-tic))
0.23963158250823188
The 100 th loop cost: 10.798033908076375, computation time: 1.81003999710083
The 200 th loop cost: 0.43828516720113303, computation time: 3.079040050506592
The 300 th loop cost: 0.4325856510397792, computation time: 4.370862007141113
The 400 th loop cost: 0.43247786654593967, computation time: 5.700303077697754
The 500 th loop cost: 0.43226414066857477, computation time: 6.98406195640564
The 600 th loop cost: 0.4322585632417634, computation time: 8.27538514137268
The 700 th loop cost: 0.432258429753933, computation time: 9.53980302810669
The 800 th loop cost: 0.4322584270128269, computation time: 10.823467016220093
The 900 th loop cost: 0.43225842696507855, computation time: 12.074893951416016
The 1000 th loop cost: 0.43225842696421646, computation time: 13.381053924560547
it takes 13.963051080703735 seconds
In [27]:
# visualize tSNE 2D representation of data after two hidden layers

labels = y_test[random_select]
colors = np.random.rand(10,3)
color_labels = [colors[int(i)] for i in labels.tolist()]
plt.scatter(Y[:,0], Y[:,1], 20, color_labels)
plt.show()