In this task, you are going to implement two classifiers and apply them to the CIFAR-10 dataset:
(1) Logistic regression classifier
(2) Softmax classifier.
# Import modules, make sure you have installed all required packages before you start.
import tensorflow as tf
import os
import pickle
import numpy as np
import time
import matplotlib.pyplot as plt
from utils.cifar_utils import load_data
from random import shuffle
# Plot configurations
%matplotlib inline
# Notebook auto reloads code. (Ref: http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython)
%load_ext autoreload
%autoreload 2
from platform import python_version
print(python_version())
3.7.9
CIFAR-10 is a widely used dataset which contains 60,000 color images of size 32x32 divided into 10 classes, with 6,000 images per class. There are 50,000 training images and 10,000 test images. We are going to use them to create our training set, validation set and test set.
First, we load the raw CIFAR-10 data and create a 10-class dataset.
# Load the raw CIFAR-10 data. You can learn how the CIFAR dataset is read by
# studying the cifar_utils code, but that is not required for this task.
X_train, y_train, X_test, y_test = load_data()

# The data has been vectorized (the storage of images rearranged) for you:
# each 1x32x32x3 image is flattened into a 1x3072 NumPy row, where 3 is the
# number of color channels. Flattening images before feeding them to an ML
# model is common practice, because the models used here cannot take 3-D
# image representations. Convolutional Neural Networks (CNN) may not need
# this; later assignments show how the data is handled for CNNs.

# Check the results
for caption, array in (
    ('Training data shape: ', X_train),
    ('Training labels shape: ', y_train),
    ('Test data shape: ', X_test),
    ('Test labels shape: ', y_test),
):
    print(caption, array.shape)
Training data shape: (50000, 3072) Training labels shape: (50000,) Test data shape: (10000, 3072) Test labels shape: (10000,)
# This part walks you through how to visualize the CIFAR-10 training dataset.
# We first find the names of the 10 categories in the dataset metadata.
# The context manager closes the file even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code; only use it on the
# trusted, official CIFAR-10 files.
with open('data/cifar-10-batches-py/batches.meta', 'rb') as f:
    namedict = pickle.load(f, encoding='latin1')
category = namedict['label_names']

# We then reshape the vectorized rows into image format: (N, 3, 32, 32) first,
# then move the channel axis last to get (N, 32, 32, 3). Using -1 lets the
# number of images follow X_train instead of hard-coding 50000.
X = X_train.reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1)
print(category)
print(X.shape)

# Visualizing CIFAR 10 data. We randomly choose 25 images from the train dataset
# and show them in a 5x5 grid, each titled with its category name.
fig, axes1 = plt.subplots(5, 5, figsize=(8, 8))
for j in range(5):
    for k in range(5):
        # Passing the length directly avoids building an index array per draw.
        i = np.random.choice(len(X))
        axes1[j][k].set_axis_off()
        axes1[j][k].imshow(X[i])
        axes1[j][k].set_title(category[y_train[i]])
['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck'] (50000, 32, 32, 3)
# Data organization (0-based row indices):
# Train data: 49,000 samples from the original train set: rows 0..48,999
# Validation data: 1,000 samples from the original train set: its last 1,000 rows
#   (rows 49,000..49,999 of the 50,000-row train set)
# Test data: 1,000 samples from the original test set: rows 0..999
# Development data (for gradient check): 100 samples drawn without replacement
#   from the first 49,000 rows of the train set
# Development data (binary) (only for gradient check in Part 1): 100 random samples from the subsampled binary train set
num_training = 49000
num_validation = 1000
num_test = 1000
num_dev = 100
num_dev_binary = 100

# The validation and development sets are taken before X_train is truncated.
X_val = X_train[-num_validation:, :]
y_val = y_train[-num_validation:]

mask = np.random.choice(num_training, num_dev, replace=False)
X_dev = X_train[mask]
y_dev = y_train[mask]

X_train = X_train[:num_training, :]
y_train = y_train[:num_training]

X_test = X_test[:num_test, :]
y_test = y_test[:num_test]

print('Train data shape: ', X_train.shape)
print('Train labels shape: ', y_train.shape)
print('Validation data shape: ', X_val.shape)
print('Validation labels shape: ', y_val.shape)
print('Test data shape: ', X_test.shape)
print('Test labels shape: ', y_test.shape)
print('Development data shape:', X_dev.shape)
# This line reports the labels; it was previously captioned as "data".
print('Development labels shape:', y_dev.shape)
Train data shape: (49000, 3072) Train labels shape: (49000,) Validation data shape: (1000, 3072) Validation labels shape: (1000,) Test data shape: (1000, 3072) Test labels shape: (1000,) Development data shape: (100, 3072) Development data shape (100,)
Next, in order to implement the experiment with the logistic regression classifier, we subsample the 10-class dataset to the 2-class dataset.
# Subsample the 10-class training set to a 2-class training set by keeping
# only the samples whose label is 0 or 1. Each boolean mask is computed once
# and reused for the data and the labels.
train_is_binary = y_train < 2
X_train_binary = X_train[train_is_binary, :]
y_train_binary = y_train[train_is_binary]
num_training_binary = X_train_binary.shape[0]

# Indices of the binary development samples, drawn without replacement.
mask_binary = np.random.choice(num_training_binary, num_dev_binary, replace=False)

val_is_binary = y_val < 2
X_val_binary = X_val[val_is_binary, :]
y_val_binary = y_val[val_is_binary]

X_dev_binary = X_train_binary[mask_binary]
y_dev_binary = y_train_binary[mask_binary]

print('Train data (binary) shape: ', X_train_binary.shape)
print('Train labels (binary) shape: ', y_train_binary.shape)
print('Validation data (binary) shape: ', X_val_binary.shape)
print('Validation labels (binary) shape: ', y_val_binary.shape)
print('Development data (binary) shape:', X_dev_binary.shape)
# This line reports the labels; it was previously captioned as "data".
print('Development labels (binary) shape:', y_dev_binary.shape)
Train data (binary) shape: (9783, 3072) Train labels (binary) shape: (9783,) Validation data (binary) shape: (217, 3072) Validation labels (binary) shape: (217,) Development data (binary) shape: (100, 3072) Development data (binary) shape (100,)
# Preprocessing: center every pixel dimension on the mean of the training data.
mean_image = np.mean(X_train, axis=0)
# The training set subtracts the float32-cast mean; the other splits subtract
# the mean exactly as np.mean returned it.
X_train = X_train.astype(np.float32) - mean_image.astype(np.float32)
X_val, X_test, X_dev = (
    split.astype(np.float32) - mean_image for split in (X_val, X_test, X_dev)
)

# Bias trick: append a constant column of ones to every split so that the
# linear classifiers only have to optimize a single weight matrix W.
X_train, X_val, X_test, X_dev = (
    np.hstack([split, np.ones((split.shape[0], 1))])
    for split in (X_train, X_val, X_test, X_dev)
)

print(X_train.shape, X_val.shape, X_test.shape, X_dev.shape)
(49000, 3073) (1000, 3073) (1000, 3073) (100, 3073)
# Preprocessing: center every pixel dimension on the mean of the binary
# training data. Note that this rebinds mean_image.
mean_image = np.mean(X_train_binary, axis=0)
# The training set subtracts the float32-cast mean; the other splits subtract
# the mean exactly as np.mean returned it.
X_train_binary = X_train_binary.astype(np.float32) - mean_image.astype(np.float32)
X_val_binary, X_dev_binary = (
    split.astype(np.float32) - mean_image for split in (X_val_binary, X_dev_binary)
)

# Bias trick: append a constant column of ones to every split so that the
# classifier only has to optimize a single weight matrix W.
X_train_binary, X_val_binary, X_dev_binary = (
    np.hstack([split, np.ones((split.shape[0], 1))])
    for split in (X_train_binary, X_val_binary, X_dev_binary)
)

print(X_train_binary.shape, X_val_binary.shape, X_dev_binary.shape)
(9783, 3073) (217, 3073) (100, 3073)
In this part, you are going to implement a logistic regression classifier.
Let’s assume a training dataset of images $x_i \in R^D$, each associated with a label $y_i$. Here $i=1 \dots N$ and $y_i \in 1 \dots K$. That is, we have N examples (each with a dimensionality D) and K distinct categories.
We will now define the score function $f: R^D \to R^K$ that maps the raw image pixels to class scores: $$f(x_i; W, b)=W x_i + b$$ where $W$ is of size $K \times D$ and $b$ is of size $K \times 1$.
Here we will use bias trick to represent the two parameters $W,b$ as one by extending the vector $x_i$ with one additional dimension that always holds the constant 1 - a default bias dimension. With the extra dimension, the new score function will simplify to a single matrix multiply: $$f = f(x_i;W)=W x_i$$
Brief introduction to logistic regression classifier
Logistic regression classifier can solve a binary classification problem ($K=2$). A binary logistic regression classifier has only two classes (0,1), and calculates the probability of class 1 as:
$$ P(y=1 | x ; w)=\frac{1}{1+e^{-f}}=\sigma\left(f\right) $$Since the probabilities of class 1 and 0 sum to one, the probability for class 0 is:
$$ P(y=0 | x ; w)=1-P(y=1 | x ; w) $$Hence, an example is classified as a positive example ($y = 1$) if $\sigma\left(f\right)>0.5$, or equivalently if the score $f>0$. The loss function then maximizes the log likelihood of this probability. You can convince yourself that this simplifies to:
$$ L_{i}=-\sum_{j} (y_{i j} \log \left(\sigma\left(f_{j}\right)\right)+\left(1-y_{i j}\right) \log \left(1-\sigma\left(f_{j}\right)\right)) $$where the labels $y_{ij}$ are assumed to be either 1 (positive) or 0 (negative), and $\sigma(\cdot)$ is the sigmoid function. The expression above can look scary but the gradient on $f$ is in fact extremely simple and intuitive:
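$$ \frac{\partial L_{i}}{\partial f_{j}}=-\left(y_{i j}-\sigma\left(f_{j}\right)\right)=\sigma\left(f_{j}\right)-y_{i j} $$
$$ \frac{\partial L_{i}}{\partial W_{j}}=-\left(y_{i j}-\sigma\left(f_{j}\right)\right) x_{i} $$
where $W_j$ denotes the weights that produce the score $f_j$. Each score $f_j$ only affects its own term of the sum in $L_i$, so the derivative is taken per component $j$ rather than summed over $j$.
[1] http://cs231n.github.io/neural-networks-2/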
[2] https://medium.com/@martinpella/logistic-regression-from-scratch-in-python-124c5636b8ac
You have to implement logistic regression in two ways: naive and vectorized. We provide the verification code for you to check if your code works properly.
def sigmoid(x):
    """Return the element-wise logistic sigmoid 1 / (1 + exp(-x)) of x."""
    exp_neg = np.exp(-x)
    return 1 / (1 + exp_neg)
def logistic_regression_loss_naive(W, X, y, reg):
    """
    Logistic regression loss function, naive implementation (with loops)

    Inputs have dimension D, there are C = 2 classes, and we operate on
    minibatches of N examples.

    Inputs:
    - W: a numpy array of shape (D, 2) containing weights.
    - X: a numpy array of shape (N, D) containing a minibatch of data.
    - y: a numpy array of shape (N,) containing training labels; y[i] = c means
         that X[i] has label c, where c can be either 0 or 1.
    - reg: (float) regularization strength

    Returns a tuple of:
    - loss: (float) the mean of the per-example losses over the minibatch,
            plus the L2 penalty reg * sum(W * W).
    - gradient: an array of the same shape as W.

    NOTE(review): the loss is computed from column 1 of the scores only, while
    the gradient treats both columns as independent logistic units (column 1
    with target y, column 0 with target 1 - y). Column 0 of the returned
    gradient is therefore not the derivative of the returned loss. This mirrors
    logistic_regression_loss_vectorized and is kept so the two stay comparable.
    """
    num_train = X.shape[0]
    loss = 0.0
    dW = np.zeros_like(W)

    for i in range(num_train):
        sample_x = X[i]          # (D,)
        target = y[i]

        # Sigmoid of this sample's two scores only; there is no need to
        # re-evaluate the whole (N, 2) score matrix on every iteration.
        prob = np.zeros(2)
        for idx in range(2):
            score = sample_x.dot(W[:, idx])
            prob[idx] = 1.0 / (1.0 + np.exp(-score))

        # Log-likelihood of the label under P(y=1 | x) = prob[1].
        loss += target * np.log(prob[1]) + (1 - target) * np.log(1 - prob[1])

        # Column 1 is pushed towards y, column 0 towards 1 - y.
        dW[:, 1] += (prob[1] - target) * sample_x
        dW[:, 0] += (prob[0] - (1 - target)) * sample_x

    # Negative mean log-likelihood plus L2 regularization.
    loss = -loss / num_train + reg * np.sum(W * W)

    # d/dW of reg * sum(W * W) is 2 * reg * W.
    dW = dW / num_train + 2 * reg * W

    return loss, dW
def logistic_regression_loss_vectorized(W, X, y, reg):
    """
    Logistic regression loss function, vectorized version.

    Inputs and outputs are the same as logistic_regression_loss_naive:
    - W: a numpy array of shape (D, 2) containing weights.
    - X: a numpy array of shape (N, D) containing a minibatch of data.
    - y: a numpy array of shape (N,) with labels 0 or 1.
    - reg: (float) regularization strength

    Returns a tuple of:
    - loss: (float) mean negative log-likelihood computed from column 1 of the
            scores, plus the L2 penalty reg * sum(W ** 2).
    - gradient: an array of the same shape as W; column 1 is driven towards y
            and column 0 towards 1 - y, plus 2 * reg * W.
    """
    y = np.asarray(y)
    m = len(y)

    # Scores and probabilities are evaluated once and reused for loss and gradient.
    scores = X @ W                       # (N, 2)
    probs = sigmoid(scores)              # (N, 2)

    # -log(sigmoid(f)) == logaddexp(0, -f) and -log(1 - sigmoid(f)) == logaddexp(0, f).
    # This form never takes log(0), which the plain log(sigmoid(...)) can do
    # once the sigmoid saturates.
    f1 = scores[:, 1]
    data_loss = np.sum(y * np.logaddexp(0, -f1) + (1 - y) * np.logaddexp(0, f1)) / m

    # Return a plain float (as documented) rather than a shape-(1,) array.
    loss = float(data_loss + reg * np.sum(W ** 2))

    # Two-column targets: column 1 holds y, column 0 holds 1 - y.
    yexpand = np.zeros((m, 2))
    yexpand[:, 1] = y
    yexpand[:, 0] = 1 - yexpand[:, 1]

    dW = (X.T @ (probs - yexpand)) / m + 2 * reg * W

    return loss, dW
# THE FOLLOWING CODE IS JUST FOR CHECKING. #
# NO NEED TO CHANGE IT. #
# Verification code for checking the correctness of the implementation of logistic_regression
# It evaluates the naive and the vectorized loss on the same weights and the
# binary development set, then compares their losses and gradients.
# generate a random weight matrix seeded with small numbers
# (the fixed seed makes W, and therefore the printed values, reproducible)
np.random.seed(3456)
W = np.random.randn(3073, 2) * 0.0001
## naive numpy implementation of Logistic Regression
loss_naive, grad_naive = logistic_regression_loss_naive(W, X_dev_binary, y_dev_binary, 0.000005)
print('naive numpy loss: {}.'.format(loss_naive))
## vectorized numpy implementation of Logistic Regression
loss_vec, grad_vec = logistic_regression_loss_vectorized(W, X_dev_binary, y_dev_binary, 0.000005)
print('vectorized numpy loss: {}.'.format(loss_vec))
## check the correctness
print('*'*100)
# NOTE: despite the "Relative" captions, the two values printed below are an
# absolute loss difference and the Frobenius norm of the gradient difference.
print('Relative loss error is {}'.format(abs(loss_vec-loss_naive)))
grad_err = np.linalg.norm(grad_naive - grad_vec, ord='fro')
print('Relative gradient error is {}'.format(grad_err))
print('*'*100)
# np.allclose is called with its default tolerances here.
print('Is vectorized loss correct? {}'.format(np.allclose(loss_naive, loss_vec)))
print('Is vectorized gradient correct? {}'.format(np.allclose(grad_naive, grad_vec)))
naive numpy loss: 0.7316005744370219. vectorized numpy loss: [0.73160057]. **************************************************************************************************** Relative loss error is [0.] Relative gradient error is 3.929830250970878e-08 **************************************************************************************************** Is vectorized loss correct? True Is vectorized gradient correct? True
Softmax classifier is a generalization of the Logistic Regression classifier to multiple classes.
In the Softmax classifier, the function mapping $f(x_i;W)=W x_i$ stays unchanged, but we now interpret the obtained scores as the unnormalized log probabilities for each class, and use a cross-entropy loss (in place of, e.g., the hinge loss an SVM would use) that has the form: $$L_i= - \log (\frac{e^{f_{y_i}}}{\sum_j e^{f_j}}).$$
The cross-entropy between a “true” distribution $p$ and an estimated distribution $q$ is defined as: $$H(p, q)=- \sum_x p(x) \log q(x).$$
Now, let's rewrite the expression of $L_i$: $$L_i= - \sum_k p_{i,k} \log (\frac{e^{f_k}}{\sum_j e^{f_j}})$$ where $p_i=[0, \dots,1, \dots, 0]$ contains a single 1 at the $y_i$-th position, $p_{i,k}=p_i[k]$, $p_i \in [1 \times K]$.
Note: Numerical stability. When you are writing code for computing the Softmax function in practice, the intermediate terms $e^{f_{y_i}}$ and $\sum_j e^{f_j}$ may be very large due to the exponentials. Dividing with large numbers can be numerically unstable, so it is important to use the normalization trick. Notice that if we multiply both the top and the bottom of the fraction by constant $C$ and push $C$ inside the exponent, we get the following (mathematically equivalent) expression: $$\frac{e^{f_{y_i}}}{\sum_j e^{f_j}}=\frac{Ce^{f_{y_i}}}{C\sum_j e^{f_j}}=\frac{e^{f_{y_i}+\log C}}{\sum_j e^{f_j+\log C}}.$$
A common choice for $C$ is to set it to $\log C= -\max_j f_j$.
In most cases, you also need to consider a bias term $b$ of length $K$ (one entry per class). However, in this experiment, since a bias dimension has already been appended to $X$, you can ignore it.
Softmax derivations (in matrix representation)
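$$\nabla_{W_k} L= - \frac{1}{N} \sum_i x_i^T(p_{i,k} - P_{i,k}) + 2 \lambda W_k,$$where $P_{i,k}= \frac{e^{f_k}}{\sum_j e^{f_j}}$ is the predicted probability of class $k$ for example $i$ (with the scores $f$ computed from $x_i$), and the term $2 \lambda W_k$ is the gradient of an $L_2$ penalty $\lambda \|W\|^2$.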
You have to implement the softmax layer in three ways. For the first two implementations, we provide the verification code for you to check if your implementation is correct.
Do not forget the $L_2$ regularization term in the loss.
# Scratch tensors for the softmax section: small random weights, a zero
# gradient buffer of matching shape, and the development data as a TF variable.
W = 0.0001 * np.random.randn(3073, 10)
dW = np.zeros_like(W)
X = tf.Variable(X_dev, dtype=tf.float32)
print(W.shape, dW.shape, X.shape)
(3073, 10) (3073, 10) (100, 3073)
def softmax_loss_naive(W, X, y, reg):
    """
    Softmax loss function, naive implementation (with loops)

    Inputs have dimension D, there are C classes, and we operate on minibatches
    of N examples.

    Inputs:
    - W: a numpy array of shape (D, C) containing weights.
    - X: a numpy array of shape (N, D) containing a minibatch of data.
    - y: a numpy array of shape (N,) containing training labels; y[i] = c means
         that X[i] has label c, where 0 <= c < C.
    - reg: (float) regularization strength

    Returns a tuple of:
    - loss: (float) the mean cross-entropy loss over the minibatch plus the
            L2 penalty 0.5 * reg * sum(W ** 2).
    - gradient: gradient wrt W, an array of same shape as W
    """
    N, D = X.shape
    C = W.shape[1]
    loss = 0.0
    probs = np.zeros((N, C))
    dW = np.zeros_like(W)

    # forward
    for i in range(N):
        scores = np.zeros(C)
        for j in range(C):
            for k in range(D):
                scores[j] += X[i, k] * W[k, j]

        # Normalization trick: subtracting the row maximum leaves the softmax
        # unchanged but keeps exp() from overflowing for large scores.
        scores -= np.max(scores)
        log_norm = np.log(np.sum(np.exp(scores)))

        # Work with log-probabilities so the loss never evaluates log(0).
        loss -= scores[y[i]] - log_norm
        probs[i, :] = np.exp(scores - log_norm)

    # compute loss
    loss /= N
    loss += 0.5 * reg * np.sum(W ** 2)

    # backward: d(loss)/d(scores) is softmax minus the one-hot label
    probs[np.arange(N), y] -= 1
    for i in range(N):
        for j in range(D):
            for k in range(C):
                dW[j, k] += X[i, j] * probs[i, k]

    # average over the batch and add the gradient of the L2 penalty
    dW /= N
    dW += reg * W

    return loss, dW
def softmax_loss_vectorized(W, X, y, reg):
    """
    Softmax loss function, vectorized version.

    Inputs and outputs are the same as softmax_loss_naive:
    - W: a numpy array of shape (D, C) containing weights.
    - X: a numpy array of shape (N, D) containing a minibatch of data.
    - y: labels of shape (N,), with 0 <= y[i] < C.
    - reg: (float) regularization strength

    Returns a tuple of:
    - loss: (float) mean cross-entropy loss plus 0.5 * reg * sum(W * W).
    - gradient: gradient wrt W, an array of same shape as W
    """
    num_train = X.shape[0]
    rows = np.arange(num_train)
    labels = np.asarray(y)

    scores = X.dot(W)                                            # (N, C)
    # Shift by the row maximum so exp() cannot overflow.
    shift_scores = scores - np.max(scores, axis=1, keepdims=True)

    # Log-softmax via log-sum-exp. Taking log(softmax) instead would give -inf
    # whenever the probability of the correct class underflows to zero.
    log_probs = shift_scores - np.log(
        np.sum(np.exp(shift_scores), axis=1, keepdims=True)
    )

    loss = -np.sum(log_probs[rows, labels]) / num_train
    loss += 0.5 * reg * np.sum(W * W)

    # d(loss)/d(scores) is softmax minus the one-hot label.
    dS = np.exp(log_probs)
    dS[rows, labels] -= 1
    dW = X.T.dot(dS) / num_train + reg * W

    return loss, dW
# THE FOLLOWING CODE IS JUST FOR CHECKING. #
# NO NEED TO CHANGE IT. #
# Verification code: compare both numpy softmax implementations against a
# reference loss and gradient computed with TensorFlow.

## generate a random weight matrix of small numbers (seeded, so reproducible)
np.random.seed(3456)
W = 0.0001 * np.random.randn(3073, 10)

# TensorFlow copies of the weights, the development data and its labels.
W_tf = tf.Variable(W, dtype=tf.float32)
X = tf.Variable(X_dev, dtype=tf.float32)
y = tf.Variable(y_dev, dtype=tf.int32)
reg = tf.constant(0.000005)

# NOTE(review): the reference penalty below is reg * sum(W * W), while the
# numpy implementations use 0.5 * reg * sum(W * W); with reg this small the
# difference stays below the tolerances used by the checks that follow.
with tf.GradientTape() as tape:
    tape.watch(W_tf)
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.one_hot(y, 10),
        logits=tf.matmul(X, W_tf),
    )
    loss_gt = tf.reduce_mean(cross_entropy) + reg * tf.reduce_sum(W_tf * W_tf)
grad_gt = tape.gradient(loss_gt, W_tf)

## naive softmax in numpy (timed)
tic = time.time()
loss_naive, grad_naive = softmax_loss_naive(W, X_dev, y_dev, 0.000005)
toc = time.time()
print('naive numpy loss: {}, takes {} seconds.'.format(loss_naive, toc - tic))

## vectorized softmax in numpy (timed)
tic = time.time()
loss_vec, grad_vec = softmax_loss_vectorized(W, X_dev, y_dev, 0.000005)
toc = time.time()
print('vectorized numpy loss: {}, takes {} seconds.'.format(loss_vec, toc - tic))

## Verify your result here - use 'rel_err' for error evaluation.
def rel_err(a, b):
    """Return the mean of the element-wise absolute difference between a and b.

    Despite the name, the value is not normalized by the magnitude of the inputs.
    """
    difference = a - b
    return np.mean(abs(difference))
# Report how far each numpy implementation is from the TensorFlow reference.
# NOTE: rel_err returns the mean absolute difference, so the "Relative"
# captions below actually show absolute errors.
print('*'*100)
print('Relative loss error of naive softmax is {}'.format(rel_err(loss_gt,loss_naive)))
print('Relative loss error of vectorized softmax is {}'.format(rel_err(loss_gt,loss_vec)))
print('Gradient error of naive softmax is {}'.format(rel_err(grad_gt,grad_naive)))
print('Gradient error of vectorized softmax is {}'.format(rel_err(grad_gt,grad_vec)))
print('*'*100)
# Loss checks use np.allclose with its default tolerances.
print('Is naive softmax loss correct? {}'.format(np.allclose(loss_gt, loss_naive)))
print('Is vectorized softmax loss correct? {}'.format(np.allclose(loss_gt, loss_vec)))
# The third positional argument of np.allclose is rtol, so the gradient checks
# use a relative tolerance of 1e-02.
print('Is naive softmax grad correct? {}'.format(np.allclose(grad_gt, grad_naive,1e-02)))
print('Is vectorized softmax grad correct? {}'.format(np.allclose(grad_gt, grad_vec,1e-02)))
naive numpy loss: 2.4370312422382856, takes 4.241668939590454 seconds. vectorized numpy loss: 2.4370312422382856, takes 0.009987831115722656 seconds. **************************************************************************************************** Relative loss error of naive softmax is 0.0 Relative loss error of vectorized softmax is 0.0 Gradient error of naive softmax is 3.5056891078966146e-07 Gradient error of vectorized softmax is 3.5056891078966146e-07 **************************************************************************************************** Is naive softmax loss correct? True Is vectorized softmax loss correct? True Is naive softmax grad correct? True Is vectorized softmax grad correct? True
Now you can start to train your classifiers. We are going to use the gradient descent algorithm for training, which differs from the usual logistic regression training process.
In the training section, you are asked to implement Stochastic gradient descent (SGD) optimization method. Pseudo code for SGD is shown below.
w = w - learning_rate * gradient
class BasicClassifier(object):
    """Base class for linear classifiers trained with mini-batch gradient descent.

    Subclasses provide the objective by overriding `loss`.
    """

    def __init__(self):
        self.W = None         # weight matrix of shape (D, C); created by train()
        self.velocity = None  # accumulated update used by the momentum step

    def train(self, X, y, learning_rate=1e-3, reg=1e-5, num_iters=100,
              batch_size=200, optim='SGD', momentum=0.5, verbose=False):
        """
        Train this linear classifier using stochastic gradient descent(SGD).

        Inputs:
        - X: a numpy array of shape (N, D) containing training data; there are N
             training samples each of dimension D.
        - y: a numpy array of shape (N,) containing training labels; y[i] = c
             means that X[i] has label 0 <= c < C for C classes.
        - learning_rate: (float) learning rate for optimization.
        - reg: (float) L2 regularization strength.
        - num_iters: (integer) number of steps to take when optimizing
        - batch_size: (integer) number of training examples to use at each step.
        - optim: the optimization method, the default optimizer is 'SGD' and
             feel free to add other optimizers.
        - momentum: (float) weight given to the previous update.
        - verbose: (boolean) if true, print progress during optimization.

        NOTE(review): `optim` is accepted but not consulted; every step uses the
        momentum form `velocity = momentum * velocity + learning_rate * dW`.
        Pass momentum=0 to get the plain `w = w - learning_rate * gradient`
        update. The rule is left as-is so existing results do not change.

        Returns:
        - loss_history: a list containing the value of the loss function of each iteration.
        """
        num_train, dim = X.shape
        num_classes = np.max(y) + 1  # assume y takes values 0...K-1 where K is number of classes

        # Initialize W and velocity (for SGD with momentum). Existing weights
        # are kept, so calling train() again continues from where it stopped.
        if self.W is None:
            self.W = 0.001 * np.random.randn(dim, num_classes)
        # Reset the velocity if it does not match W (e.g. W was replaced by hand).
        if self.velocity is None or np.shape(self.velocity) != np.shape(self.W):
            self.velocity = np.zeros_like(self.W)

        # Run stochastic gradient descent to optimize W
        loss_history = []
        for it in range(num_iters):
            # Sample batch_size row indices (with replacement) for this step.
            random_idx = np.random.choice(num_train, batch_size)
            X_batch = X[random_idx, :]
            y_batch = y[random_idx]

            # Evaluate the loss and gradient on the minibatch
            loss, dW = self.loss(X_batch, y_batch, reg)
            loss_history.append(loss)

            # Then update the parameters
            self.velocity = momentum * self.velocity + learning_rate * dW
            self.W = self.W - self.velocity

            if verbose and it % 100 == 0:
                print('iteration %d / %d: loss %f' % (it, num_iters, loss))

        return loss_history

    def predict(self, X):
        """
        Use the trained weights of this linear classifier to predict labels for
        data points.

        Inputs:
        - X: a numpy array of shape (N, D) containing training data; there are N
             training samples each of dimension D.

        Returns:
        - y_pred: predicted labels for the data in X. y_pred is a 1-dimensional
                  array of length N, and each element is an integer giving the predicted
                  class.

        Raises:
        - ValueError: if the classifier has no weights yet.
        """
        if self.W is None:
            raise ValueError('predict() called before the classifier was trained')
        # The predicted class is the column with the highest score.
        y_pred = np.matmul(X, self.W).argmax(axis=1)
        return y_pred

    def loss(self, X_batch, y_batch, reg):
        """
        Compute the loss function and its derivative.
        Subclasses will override this.

        Inputs:
        - X_batch: a numpy array of shape (N, D) containing a minibatch of N
                   data points; each point has dimension D.
        - y_batch: a numpy array of shape (N,) containing labels for the minibatch.
        - reg: (float) regularization strength.

        Returns:
        - loss: a single float
        - gradient: gradients wrt W, an array of the same shape as W

        Raises:
        - NotImplementedError: always, in this base class.
        """
        # Fail loudly instead of returning None, which train() cannot unpack.
        raise NotImplementedError('subclasses must implement loss()')
class Logistic_Regression(BasicClassifier):
    """ A subclass that uses the Logistic Regression loss function """

    def loss(self, X_batch, y_batch, reg):
        """Evaluate the vectorized logistic regression loss and gradient at self.W."""
        loss_value, gradient = logistic_regression_loss_vectorized(
            self.W, X_batch, y_batch, reg
        )
        return loss_value, gradient
# THE FOLLOWING CODE IS JUST FOR CHECKING. #
# NO NEED TO CHANGE IT. #

## Logistic Regression + SGD
classifier = Logistic_Regression()
reg = 1e-5  # regularization
lr = 1e-7   # learning rate

loss_hist_sgd = classifier.train(
    X=X_train_binary,
    y=y_train_binary,
    learning_rate=lr,
    reg=reg,
    num_iters=1500,
    optim='SGD',
    verbose=True,
)

# Evaluate the performance on both the training set and the validation set:
# accuracy is the fraction of predictions that equal the labels.
y_train_pred = classifier.predict(X_train_binary)
print('training accuracy: %f' % np.mean(y_train_pred == y_train_binary))
y_val_pred = classifier.predict(X_val_binary)
print('validation accuracy: %f' % np.mean(y_val_pred == y_val_binary))
iteration 0 / 1500: loss 0.858037 iteration 100 / 1500: loss 0.765486 iteration 200 / 1500: loss 0.599353 iteration 300 / 1500: loss 0.567630 iteration 400 / 1500: loss 0.588487 iteration 500 / 1500: loss 0.579063 iteration 600 / 1500: loss 0.590700 iteration 700 / 1500: loss 0.492008 iteration 800 / 1500: loss 0.566183 iteration 900 / 1500: loss 0.551010 iteration 1000 / 1500: loss 0.529384 iteration 1100 / 1500: loss 0.480550 iteration 1200 / 1500: loss 0.488102 iteration 1300 / 1500: loss 0.429249 iteration 1400 / 1500: loss 0.467174 training accuracy: 0.805377 validation accuracy: 0.764977
## SGD error plot: loss recorded at every training iteration
ax = plt.gca()
ax.plot(loss_hist_sgd, label='SGD')
ax.set_xlabel('Iteration number')
ax.set_ylabel('Loss value')
ax.legend()
plt.show()
class Softmax(BasicClassifier):
    """ A subclass that uses the Softmax + Cross-entropy loss function """

    def loss(self, X_batch, y_batch, reg):
        """Evaluate the vectorized softmax loss and gradient at self.W."""
        loss_value, gradient = softmax_loss_vectorized(
            self.W, X_batch, y_batch, reg
        )
        return loss_value, gradient
# THE FOLLOWING CODE IS JUST FOR CHECKING. #
# NO NEED TO CHANGE IT. #

## Softmax + SGD
classifier = Softmax()
reg = 1e-5  # regularization
lr = 1e-7   # learning rate

loss_hist_sgd = classifier.train(
    X=X_train,
    y=y_train,
    learning_rate=lr,
    reg=reg,
    num_iters=1500,
    optim='SGD',
    verbose=True,
)

# Evaluate the performance on both the training and the validation set:
# accuracy is the fraction of predictions that equal the labels.
y_train_pred = classifier.predict(X_train)
print('training accuracy: %f' % np.mean(y_train_pred == y_train))
y_val_pred = classifier.predict(X_val)
print('validation accuracy: %f' % np.mean(y_val_pred == y_val))
iteration 0 / 1500: loss 5.224113 iteration 100 / 1500: loss 3.644922 iteration 200 / 1500: loss 3.229025 iteration 300 / 1500: loss 3.063280 iteration 400 / 1500: loss 2.905337 iteration 500 / 1500: loss 2.907425 iteration 600 / 1500: loss 2.643449 iteration 700 / 1500: loss 2.817604 iteration 800 / 1500: loss 2.745169 iteration 900 / 1500: loss 2.591534 iteration 1000 / 1500: loss 2.500669 iteration 1100 / 1500: loss 2.439365 iteration 1200 / 1500: loss 2.446261 iteration 1300 / 1500: loss 2.436632 iteration 1400 / 1500: loss 2.274754 training accuracy: 0.282327 validation accuracy: 0.293000
## SGD loss curve: loss recorded at every training iteration
ax = plt.gca()
ax.plot(loss_hist_sgd, label='SGD')
ax.set_xlabel('Iteration number')
ax.set_ylabel('Loss value')
ax.legend()
plt.show()