代码实现题code
daseCV/classifiers/k_nearest_neighbor.py
from builtins import range
from builtins import object
import numpy as np
class KNearestNeighbor(object):
    """A k-nearest-neighbor (kNN) classifier using the L2 (Euclidean) distance."""

    def __init__(self):
        pass

    def train(self, X, y):
        """
        Train the classifier. For k-nearest neighbors this is just
        memorizing the training data.

        Inputs:
        - X: A numpy array of shape (num_train, D) containing the training data
          consisting of num_train samples each of dimension D.
        - y: A numpy array of shape (num_train,) containing the training labels,
          where y[i] is the label for X[i].
        """
        self.X_train = X
        self.y_train = y

    def predict(self, X, k=1, num_loops=0):
        """
        Predict labels for test data using this classifier.

        Inputs:
        - X: A numpy array of shape (num_test, D) containing test data consisting
          of num_test samples each of dimension D.
        - k: The number of nearest neighbors that vote for the predicted labels.
        - num_loops: Determines which implementation to use to compute distances
          between training points and testing points (0, 1 or 2).

        Returns:
        - y: A numpy array of shape (num_test,) containing predicted labels for the
          test data, where y[i] is the predicted label for the test point X[i].

        Raises:
        - ValueError: if num_loops is not 0, 1 or 2, or if k < 1.
        """
        if num_loops == 0:
            dists = self.compute_distances_no_loops(X)
        elif num_loops == 1:
            dists = self.compute_distances_one_loop(X)
        elif num_loops == 2:
            dists = self.compute_distances_two_loops(X)
        else:
            raise ValueError('Invalid value %d for num_loops' % num_loops)
        return self.predict_labels(dists, k=k)

    def compute_distances_two_loops(self, X):
        """
        Compute the distance between each test point in X and each training point
        in self.X_train using a nested loop over both the training data and the
        test data.

        Inputs:
        - X: A numpy array of shape (num_test, D) containing test data.

        Returns:
        - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
          is the Euclidean distance between the ith test point and the jth training
          point.
        """
        num_test = X.shape[0]
        num_train = self.X_train.shape[0]
        dists = np.zeros((num_test, num_train))
        for i in range(num_test):
            for j in range(num_train):
                # TODO: compute the L2 distance between the ith test point and
                # the jth training point and store it in dists[i, j], without
                # an inner loop over dimensions and without np.linalg.norm().
                # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
                diff = X[i] - self.X_train[j]
                dists[i, j] = np.sqrt(diff.dot(diff))
                # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
        return dists

    def compute_distances_one_loop(self, X):
        """
        Compute the distance between each test point in X and each training point
        in self.X_train using a single loop over the test data.

        Input / Output: Same as compute_distances_two_loops
        """
        num_test = X.shape[0]
        num_train = self.X_train.shape[0]
        dists = np.zeros((num_test, num_train))
        for i in range(num_test):
            # TODO: compute the L2 distance between the ith test point and all
            # training points and store it in dists[i, :], without
            # np.linalg.norm().
            # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
            # X[i] broadcasts against every row of X_train; summing the squared
            # differences over axis=1 yields one squared distance per training
            # point.
            dists[i] = np.sqrt(np.sum(np.square(self.X_train - X[i]), axis=1))
            # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
        return dists

    def compute_distances_no_loops(self, X):
        """
        Compute the distance between each test point in X and each training point
        in self.X_train using no explicit loops.

        Input / Output: Same as compute_distances_two_loops
        """
        # TODO: compute all pairwise L2 distances without explicit loops, using
        # only basic array operations (no scipy, no np.linalg.norm()).
        # Hint: use a matrix multiplication and two broadcast sums.
        # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
        # ||x - y||^2 = ||x||^2 + ||y||^2 - 2 x.y
        # The squared norms are shaped (num_test, 1) and (1, num_train) so that
        # they broadcast against the (num_test, num_train) cross term.
        test_sq = np.sum(np.square(X), axis=1).reshape(-1, 1)
        train_sq = np.sum(np.square(self.X_train), axis=1).reshape(1, -1)
        sq_dists = test_sq + train_sq - 2 * X.dot(self.X_train.T)
        # Floating-point cancellation can leave tiny negative values for
        # (near-)identical points; clamp them so sqrt never returns NaN.
        dists = np.sqrt(np.maximum(sq_dists, 0))
        # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
        return dists

    def predict_labels(self, dists, k=1):
        """
        Given a matrix of distances between test points and training points,
        predict a label for each test point.

        Inputs:
        - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
          gives the distance between the ith test point and the jth training point.
        - k: The number of nearest neighbors that vote; must be >= 1.

        Returns:
        - y: A numpy array of shape (num_test,) containing predicted labels for the
          test data, where y[i] is the predicted label for the test point X[i].

        Raises:
        - ValueError: if k < 1.
        """
        if k < 1:
            # A non-positive k would either produce an empty vote or silently
            # slice from the wrong end of the sorted neighbor list.
            raise ValueError('k must be a positive integer, got %r' % (k,))
        num_test = dists.shape[0]
        y_pred = np.zeros(num_test)
        for i in range(num_test):
            # TODO: use the distance matrix to find the k nearest neighbors of
            # the ith test point and look up their labels in self.y_train.
            # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
            # np.argsort returns the indices that would sort the row.
            nearest = np.argsort(dists[i])[:k]
            closest_y = self.y_train[nearest]
            # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
            # TODO: pick the most common label among closest_y and store it in
            # y_pred[i]; break ties by choosing the smaller label.
            # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
            # np.bincount needs non-negative integer labels; argmax returns the
            # first maximum, i.e. the smallest label among tied candidates.
            y_pred[i] = np.bincount(closest_y).argmax()
            # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
        return y_pred
daseCV/classifiers/linear_classifier.py
from __future__ import print_function
from builtins import range
from builtins import object
import numpy as np
from daseCV.classifiers.linear_svm import *
from daseCV.classifiers.softmax import *
# from past.builtins import xrange
class LinearClassifier(object):
    """
    Base class for linear classifiers trained with minibatch stochastic
    gradient descent. Subclasses supply the objective by overriding loss().
    """

    def __init__(self):
        # Weight matrix of shape (D, C); created lazily on the first train() call.
        self.W = None

    def train(self, X, y, learning_rate=1e-3, reg=1e-5, num_iters=100,
              batch_size=200, verbose=False):
        """
        Train this linear classifier using stochastic gradient descent.

        Inputs:
        - X: A numpy array of shape (N, D) containing training data; there are N
          training samples each of dimension D.
        - y: A numpy array of shape (N,) containing training labels; y[i] = c
          means that X[i] has label 0 <= c < C for C classes.
        - learning_rate: (float) learning rate for optimization.
        - reg: (float) regularization strength.
        - num_iters: (integer) number of steps to take when optimizing
        - batch_size: (integer) number of training examples to use at each step.
        - verbose: (boolean) If true, print progress during optimization.

        Outputs:
        A list containing the value of the loss function at each training iteration.
        """
        num_train, dim = X.shape
        # Assume y takes values 0...K-1 where K is the number of classes.
        num_classes = np.max(y) + 1
        if self.W is None:
            # Lazily initialize W with small random values.
            self.W = 0.001 * np.random.randn(dim, num_classes)

        # Run stochastic gradient descent to optimize W.
        loss_history = []
        for it in range(num_iters):
            # TODO: sample batch_size training examples and their labels for
            # this step; X_batch has shape (batch_size, dim) and y_batch has
            # shape (batch_size,).
            # Hint: sampling with replacement is slightly faster than without.
            # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
            indexes = np.random.choice(num_train, batch_size, replace=True)
            X_batch = X[indexes, :]
            y_batch = y[indexes]
            # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

            # Evaluate loss and gradient on the minibatch.
            loss, grad = self.loss(X_batch, y_batch, reg)
            loss_history.append(loss)

            # TODO: update the weights using the gradient and the learning rate.
            # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
            self.W -= learning_rate * grad
            # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

            if verbose and it % 100 == 0:
                print('iteration %d / %d: loss %f' % (it, num_iters, loss))

        return loss_history

    def predict(self, X):
        """
        Use the trained weights of this linear classifier to predict labels for
        data points.

        Inputs:
        - X: A numpy array of shape (N, D) containing data; there are N
          samples each of dimension D.

        Returns:
        - y_pred: Predicted labels for the data in X. y_pred is a 1-dimensional
          array of length N, and each element is an integer giving the predicted
          class.
        """
        # TODO: implement this method and store the predicted labels in y_pred.
        # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
        # The predicted class is the column with the highest linear score.
        scores = X.dot(self.W)
        y_pred = np.argmax(scores, axis=1)
        # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
        return y_pred

    def loss(self, X_batch, y_batch, reg):
        """
        Compute the loss function and its derivative.
        Subclasses must override this.

        Inputs:
        - X_batch: A numpy array of shape (N, D) containing a minibatch of N
          data points; each point has dimension D.
        - y_batch: A numpy array of shape (N,) containing labels for the minibatch.
        - reg: (float) regularization strength.

        Returns: A tuple containing:
        - loss as a single float
        - gradient with respect to self.W; an array of the same shape as W

        Raises:
        - NotImplementedError: always, on the base class.
        """
        # Fail loudly here rather than returning None, which train() would
        # then fail to unpack with a far less helpful error.
        raise NotImplementedError(
            'LinearClassifier subclasses must implement loss()')
class LinearSVM(LinearClassifier):
    """Linear classifier whose objective is the multiclass SVM loss."""

    def loss(self, X_batch, y_batch, reg):
        """Evaluate the vectorized SVM loss and its gradient on one minibatch."""
        weights = self.W
        return svm_loss_vectorized(weights, X_batch, y_batch, reg)
class Softmax(LinearClassifier):
    """Linear classifier whose objective is the softmax cross-entropy loss."""

    def loss(self, X_batch, y_batch, reg):
        """Evaluate the vectorized softmax loss and its gradient on one minibatch."""
        weights = self.W
        return softmax_loss_vectorized(weights, X_batch, y_batch, reg)
daseCV/classifiers/linear_svm.py
from builtins import range
import numpy as np
from random import shuffle
# from past.builtins import xrange
def svm_loss_naive(W, X, y, reg):
    """
    Structured (multiclass hinge) SVM loss, reference implementation with
    explicit Python loops.

    Inputs have dimension D, there are C classes, and we operate on minibatches
    of N examples.

    Inputs:
    - W: A numpy array of shape (D, C) containing weights.
    - X: A numpy array of shape (N, D) containing a minibatch of data.
    - y: A numpy array of shape (N,) containing training labels; y[i] = c means
      that X[i] has label c, where 0 <= c < C.
    - reg: (float) regularization strength

    Returns a tuple of:
    - loss as single float
    - gradient with respect to weights W; an array of same shape as W
    """
    num_train = X.shape[0]
    num_classes = W.shape[1]
    dW = np.zeros(W.shape)  # gradient accumulator, same shape as W
    loss = 0.0

    # The gradient is accumulated alongside the loss: every violated margin
    # pushes the offending class column towards +x and the true class column
    # towards -x.
    for i, sample in enumerate(X):
        label = y[i]
        scores = sample.dot(W)
        target_score = scores[label]
        for cls in range(num_classes):
            if cls == label:
                continue
            margin = scores[cls] - target_score + 1  # delta = 1
            if not margin > 0:
                continue
            loss += margin
            dW[:, cls] += sample
            dW[:, label] -= sample

    # Turn the sums over examples into averages.
    loss /= num_train
    dW /= num_train

    # L2 regularization term and its gradient.
    loss += reg * np.sum(W * W)
    dW += 2 * reg * W

    # TODO: compute the gradient of the loss and store it in dW. It is simpler
    # to compute it together with the loss than afterwards.
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    # Already accumulated in the loop above.
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    return loss, dW
def svm_loss_vectorized(W, X, y, reg):
    """
    Structured SVM loss function, vectorized implementation.

    Inputs and outputs are the same as svm_loss_naive.
    """
    loss = 0.0
    dW = np.zeros(W.shape)  # gradient accumulator, same shape as W

    # TODO: implement a vectorized SVM loss and store the result in loss.
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    num_train = X.shape[0]
    rows = np.arange(num_train)
    scores = X.dot(W)
    # Column vector of each example's true-class score, for broadcasting.
    true_scores = scores[rows, y].reshape(-1, 1)
    margins = np.maximum(0, scores - true_scores + 1)
    margins[rows, y] = 0  # the true class never contributes to the loss
    loss += np.sum(margins) / num_train
    loss += reg * np.sum(W * W)
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    # TODO: implement a vectorized gradient and store the result in dW.
    # Hint: reuse the intermediate values computed for the loss.
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    # Each violated margin contributes +x to its class column; the true class
    # column receives -x once per violation in that row.
    coeffs = (margins != 0).astype(margins.dtype)
    coeffs[rows, y] = -np.sum(coeffs, axis=1)
    dW += X.T.dot(coeffs) / num_train + 2 * reg * W
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    return loss, dW
daseCV/classifiers/neural_net.py
from __future__ import print_function
from builtins import range
from builtins import object
import numpy as np
import matplotlib.pyplot as plt
# from past.builtins import xrange
class TwoLayerNet(object):
    """
    A two-layer fully-connected neural network. The net has an input dimension of
    D, a hidden layer dimension of H, and performs classification over C classes.
    We train the network with a softmax loss function and L2 regularization on the
    weight matrices. The network uses a ReLU nonlinearity after the first fully
    connected layer.

    In other words, the network has the following architecture:

    input - fully connected layer - ReLU - fully connected layer - softmax

    The outputs of the second fully-connected layer are the scores for each class.
    """

    def __init__(self, input_size, hidden_size, output_size, std=1e-4):
        """
        Initialize the model. Weights are initialized to small random values and
        biases are initialized to zero. Weights and biases are stored in the
        variable self.params, which is a dictionary with the following keys:

        W1: First layer weights; has shape (D, H)
        b1: First layer biases; has shape (H,)
        W2: Second layer weights; has shape (H, C)
        b2: Second layer biases; has shape (C,)

        Inputs:
        - input_size: The dimension D of the input data.
        - hidden_size: The number of neurons H in the hidden layer.
        - output_size: The number of classes C.
        - std: Scale applied to the standard-normal weight initialization.
        """
        self.params = {}
        self.params['W1'] = std * np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        self.params['W2'] = std * np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)

    def loss(self, X, y=None, reg=0.0):
        """
        Compute the loss and gradients for a two layer fully connected neural
        network.

        Inputs:
        - X: Input data of shape (N, D). Each X[i] is a training sample.
        - y: Vector of training labels. y[i] is the label for X[i], and each y[i] is
          an integer in the range 0 <= y[i] < C. This parameter is optional; if it
          is not passed then we only return scores, and if it is passed then we
          instead return the loss and gradients.
        - reg: Regularization strength.

        Returns:
        If y is None, return a matrix scores of shape (N, C) where scores[i, c] is
        the score for class c on input X[i].

        If y is not None, instead return a tuple of:
        - loss: Loss (data loss and regularization loss) for this batch of training
          samples.
        - grads: Dictionary mapping parameter names to gradients of those parameters
          with respect to the loss function; has the same keys as self.params.
        """
        # Unpack variables from the params dictionary.
        W1, b1 = self.params['W1'], self.params['b1']
        W2, b2 = self.params['W2'], self.params['b2']
        N = X.shape[0]

        # TODO: forward pass; store the class scores, shape (N, C), in scores.
        # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
        hidden = np.maximum(0, X @ W1 + b1)  # ReLU activations, shape (N, H)
        scores = hidden @ W2 + b2
        # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

        # If the targets are not given then jump out, we're done.
        if y is None:
            return scores

        # TODO: finish the forward pass and compute the scalar softmax loss,
        # including L2 regularization on W1 and W2.
        # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
        rows = np.arange(N)
        # Subtracting the row maximum leaves the softmax unchanged but keeps
        # np.exp from overflowing on large scores.
        shifted = scores - np.max(scores, axis=1, keepdims=True)
        exp_shifted = np.exp(shifted)
        row_sums = np.sum(exp_shifted, axis=1, keepdims=True)
        log_probs = shifted[rows, y] - np.log(row_sums[:, 0])
        loss = -np.sum(log_probs) / N + reg * (np.sum(W1 * W1) + np.sum(W2 * W2))
        # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

        # Backward pass: compute gradients.
        grads = {}
        # TODO: backward pass; store the gradient of every parameter in grads
        # under the same key as in self.params.
        # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
        # d(loss)/d(scores) for softmax cross-entropy: (probs - one_hot) / N.
        dscores = exp_shifted / row_sums
        dscores[rows, y] -= 1
        dscores /= N

        grads['W2'] = hidden.T @ dscores + 2 * reg * W2
        grads['b2'] = np.sum(dscores, axis=0)

        # Back-propagate through the ReLU: the gradient only flows where the
        # unit was active. A fresh array is masked so `hidden` stays intact.
        dhidden = dscores @ W2.T
        dhidden[hidden <= 0] = 0

        grads['W1'] = X.T @ dhidden + 2 * reg * W1
        grads['b1'] = np.sum(dhidden, axis=0)
        # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

        return loss, grads

    def train(self, X, y, X_val, y_val,
              learning_rate=1e-3, learning_rate_decay=0.95,
              reg=5e-6, num_iters=100,
              batch_size=200, verbose=False):
        """
        Train this neural network using stochastic gradient descent.

        Inputs:
        - X: A numpy array of shape (N, D) giving training data.
        - y: A numpy array of shape (N,) giving training labels; y[i] = c means that
          X[i] has label c, where 0 <= c < C.
        - X_val: A numpy array of shape (N_val, D) giving validation data.
        - y_val: A numpy array of shape (N_val,) giving validation labels.
        - learning_rate: Scalar giving learning rate for optimization.
        - learning_rate_decay: Scalar giving factor used to decay the learning rate
          after each epoch.
        - reg: Scalar giving regularization strength.
        - num_iters: Number of steps to take when optimizing.
        - batch_size: Number of training examples to use per step.
        - verbose: boolean; if true print progress during optimization.

        Returns:
        A dictionary with keys 'loss_history' (one entry per iteration) and
        'train_acc_history' / 'val_acc_history' (one entry per epoch).
        """
        num_train = X.shape[0]
        # Integer division: with true division the epoch length can be
        # fractional, and `it % iterations_per_epoch == 0` would then almost
        # never hold, skipping accuracy logging and learning-rate decay.
        iterations_per_epoch = max(num_train // batch_size, 1)

        # Use SGD to optimize the parameters in self.params.
        loss_history = []
        train_acc_history = []
        val_acc_history = []

        for it in range(num_iters):
            # TODO: draw a random minibatch of data and labels.
            # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
            indexes = np.random.choice(num_train, batch_size, replace=True)
            X_batch = X[indexes, :]
            y_batch = y[indexes]
            # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

            # Compute loss and gradients using the current minibatch.
            loss, grads = self.loss(X_batch, y=y_batch, reg=reg)
            loss_history.append(loss)

            # TODO: update the parameters in self.params with the gradients in
            # grads using stochastic gradient descent.
            # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
            for name in ('W1', 'b1', 'W2', 'b2'):
                self.params[name] -= learning_rate * grads[name]
            # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

            if verbose and it % 100 == 0:
                print('iteration %d / %d: loss %f' % (it, num_iters, loss))

            # Every epoch, check train and val accuracy and decay learning rate.
            if it % iterations_per_epoch == 0:
                # Training accuracy is estimated on the current minibatch only.
                train_acc = (self.predict(X_batch) == y_batch).mean()
                val_acc = (self.predict(X_val) == y_val).mean()
                train_acc_history.append(train_acc)
                val_acc_history.append(val_acc)

                # Decay learning rate.
                learning_rate *= learning_rate_decay

        return {
            'loss_history': loss_history,
            'train_acc_history': train_acc_history,
            'val_acc_history': val_acc_history,
        }

    def predict(self, X):
        """
        Use the trained weights of this two-layer network to predict labels for
        data points. For each data point we predict scores for each of the C
        classes, and assign each data point to the class with the highest score.

        Inputs:
        - X: A numpy array of shape (N, D) giving N D-dimensional data points to
          classify.

        Returns:
        - y_pred: A numpy array of shape (N,) giving predicted labels for each of
          the elements of X. For all i, y_pred[i] = c means that X[i] is predicted
          to have class c, where 0 <= c < C.
        """
        # TODO: Implement this function; it should be VERY simple!
        # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
        # loss() without labels returns the raw class scores, so the forward
        # pass is defined in exactly one place.
        scores = self.loss(X)
        y_pred = np.argmax(scores, axis=1)
        # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
        return y_pred
from builtins import range
import numpy as np
from random import shuffle
# from past.builtins import xrange
def softmax_loss_naive(W, X, y, reg):
    """
    Softmax loss function, naive implementation (with loops)

    Inputs have dimension D, there are C classes, and we operate on minibatches
    of N examples.

    Inputs:
    - W: A numpy array of shape (D, C) containing weights.
    - X: A numpy array of shape (N, D) containing a minibatch of data.
    - y: A numpy array of shape (N,) containing training labels; y[i] = c means
      that X[i] has label c, where 0 <= c < C.
    - reg: (float) regularization strength

    Returns a tuple of:
    - loss as single float
    - gradient with respect to weights W; an array of same shape as W
    """
    # Initialize the loss and gradient to zero.
    loss = 0.0
    dW = np.zeros_like(W)

    # TODO: compute the softmax loss and its gradient with explicit loops and
    # store them in loss and dW. Watch out for numerical instability, and do
    # not forget the regularization.
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    num_train = X.shape[0]
    for i in range(num_train):
        scores = X[i].dot(W)
        # Shifting by the maximum leaves the softmax unchanged but prevents
        # np.exp from overflowing for large scores.
        shifted = scores - np.max(scores)
        exp_shifted = np.exp(shifted)
        exp_sum = np.sum(exp_shifted)

        # -log(softmax[y_i]) written as log-sum-exp minus the true-class score.
        loss -= shifted[y[i]] - np.log(exp_sum)

        # d(loss_i)/d(scores) = softmax - one_hot(y_i); the gradient w.r.t. W
        # is the outer product of the input with that vector.
        dscores = exp_shifted / exp_sum
        dscores[y[i]] -= 1
        dW += np.outer(X[i], dscores)

    # Average over the minibatch.
    loss /= num_train
    dW /= num_train

    # L2 regularization term and its gradient.
    loss += reg * np.sum(W * W)
    dW += 2 * reg * W
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    return loss, dW
def softmax_loss_vectorized(W, X, y, reg):
    """
    Softmax loss function, vectorized version.

    Inputs and outputs are the same as softmax_loss_naive.
    """
    # Initialize the loss and gradient to zero.
    loss = 0.0
    dW = np.zeros_like(W)

    # TODO: compute the softmax loss and its gradient without explicit loops
    # and store them in loss and dW. Watch out for numerical instability, and
    # do not forget the regularization.
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    num_train = X.shape[0]
    rows = np.arange(num_train)

    scores = X.dot(W)
    # Shifting every row by its maximum leaves the softmax unchanged but
    # prevents np.exp from overflowing for large scores.
    shifted = scores - np.max(scores, axis=1, keepdims=True)
    exp_shifted = np.exp(shifted)
    row_sums = np.sum(exp_shifted, axis=1, keepdims=True)

    # -log(softmax[y_i]) written as log-sum-exp minus the true-class score.
    loss -= np.sum(shifted[rows, y] - np.log(row_sums[:, 0]))

    # d(loss)/d(scores) = softmax - one_hot(y).
    dscores = exp_shifted / row_sums
    dscores[rows, y] -= 1
    dW += X.T.dot(dscores)

    # Average over the minibatch.
    loss /= num_train
    dW /= num_train

    # L2 regularization term and its gradient.
    loss += reg * np.sum(W * W)
    dW += 2 * reg * W
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    return loss, dW