# 【2.4.1.1】tensorflow多GPU的实现-1

1. 复习训练神经网络中用到的基本概念，并为本课程建立通用的术语。
2. 阐述神经网络训练中涉及的基本原理（特别是随机梯度下降法）。
3. 为本课程的后续实验课打下基础，帮助我们逐步过渡到用多 GPU 实现神经网络。

## 一、训练神经网络

### 1.1 生成随机数据集

# Numpy is a fundamental package for scientific computing. It contains, among many, an implementation of an array
# that we will use in this exercise.
import numpy as np
# We will be generating our own RANDOM dataset. As a consequence we need functionality to generate random numbers.
import random
# We will be plotting the progress of training as well as the behaviour of our training algorithm
# hence MatPlotLib. A python package that can be used to generate 2D and 3D plots.
import matplotlib.pyplot as plt
# TensorFlow - so the deep learning framework of choice for this class.
import tensorflow as tf


# Define the number of samples/data points you want to generate
n_samples = 100

# We will define a dataset that lies on a line as defined by y = w_gen * x + b_gen
w_gen = 10
b_gen = 2

# To make the problem a bit more interesting we will add some Gaussian noise as
# defined by the mean and standard deviation below.
mean_gen = 0
std_gen = 1

# Generate the training dataset: x is uniform on [0, 10); y is the noisy line
# w_gen * (x + noise) + b_gen.  The comprehension variable is named x_i so it
# does not shadow the dataset array x itself (the original used `for x in x`).
x = np.float32(np.random.uniform(0, 10, n_samples))
y = np.float32(
    np.array([w_gen * (x_i + np.random.normal(loc=mean_gen, scale=std_gen, size=None)) + b_gen
              for x_i in x]))

# Plot our randomly generated dataset
plt.close()
plt.plot(x, y, 'go')
plt.xlabel("x", size=24)
plt.ylabel("y", size=24)
plt.tick_params(axis='both', labelsize=16)
plt.tight_layout()
plt.show()


### 1.2 定义模型

1. 创建模型的定义。
2. 确定指导我们训练过程的损失函数。损失函数告诉我们优化算法在模型训练中取得的进展，它实际上定义了训练成功与否。
3. 然后迭代地进行以下操作：

• 计算损失函数相对于模型权重的梯度。
• 用这个梯度来更新模型权重，以将损失函数最小化。

# Define the TensorFlow variables based on our inputs
X = tf.Variable(x, name="X")
Y = tf.Variable(y, name="Y")

# Create our model variables w (weights; this is intended to map to the slope, w_gen)
# and b (bias; this maps to the intercept, b_gen).
# For simplicity, we initialize the data to zero.
w = tf.Variable(np.float32(0.0), name="weights")
b = tf.Variable(np.float32(0.0), name="bias")


# Define our model: a single linear neuron, y = w * x + b.
# (The return statement had lost its indentation in the pasted source.)
@tf.function
def forward(x):
    """Compute the linear model prediction w * x + b for input x."""
    return w * x + b


### 1.3 定义损失函数

（此处原有一幅插图，在导出时丢失。）
# The loss function is an indicator of how good or bad our model is at any
# point in time.  Mean squared error: the average of (prediction - target)^2.
loss_fn = tf.keras.losses.MeanSquaredError()


### 1.4 定义优化逻辑 - 梯度下降

# Define a gradient descent optimizer with a fixed learning rate of 0.001.
# Note that the "SGD" optimizer is simple gradient descent if applied
# to the full dataset, and stochastic gradient descent if applied to
# random subsets of the dataset.
optimizer = tf.keras.optimizers.SGD(learning_rate=0.001)


### 1.5 训练过程的循环

# Define the maximum number of times we want to process the entire dataset (the number of epochs).
# In practice we won't run this many because we'll implement an early stopping condition that
# detects when the training process has converged.
max_number_of_epochs = 1000

# We will store information about the optimization process here.
loss_array = []
b_array = []
w_array = []

# Zero out the initial values
w.assign(0.0)
b.assign(0.0)

# Print out the parameters and loss before we do any training
Y_predicted = forward(X)
loss_value = loss_fn(Y_predicted, Y)
print("Before training: w = {:4.3f}, b = {:4.3f}, loss = {:7.3f}".format(w.numpy(), b.numpy(), loss_value))

print("")
print("Starting training")
print("")

# Start the training process
for i in range(max_number_of_epochs):

    # Use the entire dataset to calculate the gradient and update the parameters
    with tf.GradientTape() as tape:
        Y_predicted = forward(X)
        loss_value = loss_fn(Y_predicted, Y)

    optimizer.minimize(loss_value, var_list=[w, b], tape=tape)

    # Capture the data that we will use in our visualization
    w_array.append(w.numpy())
    b_array.append(b.numpy())
    loss_array.append(loss_value)

    if (i + 1) % 5 == 0:
        print("Epoch = {:2d}: w = {:4.3f}, b = {:4.3f}, loss = {:7.3f}".format(i+1, w.numpy(), b.numpy(), loss_value))

    # Convergence check (this was the FIXME): stop once the relative change in
    # the loss between consecutive epochs drops below 0.1%.
    if i > 1 and abs(loss_array[i] - loss_array[i-1]) / loss_array[i-1] < 0.001:
        break

print("")
print("Training finished after {} epochs".format(i+1))
print("")

print("After training: w = {:4.3f}, b = {:4.3f}, loss = {:7.3f}".format(w.numpy(), b.numpy(), loss_value))


# Visualize how the loss evolved over the training epochs.
plt.close()
fig, ax = plt.subplots()
ax.plot(loss_array)
ax.set_xlabel("Epoch", size=24)
ax.set_ylabel("Loss", size=24)
ax.tick_params(axis='both', labelsize=16)
fig.tight_layout()
plt.show()


### 1.6 在三维空间中观察模型的损失的运动轨迹

from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure()
# fig.gca(projection='3d') was deprecated in Matplotlib 3.4 and removed in 3.6;
# add_subplot is the supported way to create a 3D axes.
ax = fig.add_subplot(projection='3d')

# Scatter the (w, b, loss) triples recorded at every update step.
ax.scatter(w_array, b_array, loss_array)

ax.set_xlabel('w', size=16)
ax.set_ylabel('b', size=16)
ax.tick_params(labelsize=12)

plt.show()


# Sample the loss surface over a grid of (w, b) values so the optimization
# trajectory can be viewed against the full loss landscape.
loss_surface = []
w_surface = []
b_surface = []

for w_value in np.linspace(0, 20, 200):
    for b_value in np.linspace(-18, 22, 200):

        # Collect information about the loss function surface
        w.assign(w_value)
        b.assign(b_value)

        Y_predicted = forward(X)
        loss_value = loss_fn(Y_predicted, Y)

        b_surface.append(b_value)
        w_surface.append(w_value)
        loss_surface.append(loss_value)

plt.close()

fig = plt.figure()
# fig.gca(projection='3d') was removed in Matplotlib 3.6; use add_subplot.
ax2 = fig.add_subplot(projection='3d')

# Faint scatter of the sampled surface, with the training trajectory overlaid.
ax2.scatter(w_surface, b_surface, loss_surface, c=loss_surface, alpha=0.02)
ax2.plot(w_array, b_array, loss_array, color='black')

ax2.set_xlabel('w')
ax2.set_ylabel('b')

plt.show()


## 二、 随机梯度下降法

### 2.1 实现随机梯度下降：第一种方法

# Define the maximum number of times we want to process the entire dataset (the number of epochs).
# In practice we won't run this many because we'll implement an early stopping condition that
# detects when the training process has converged.
max_number_of_epochs = 1000

# We will store information about the optimization process here.
loss_array = []
b_array = []
w_array = []

# Zero out the initial values
w.assign(0.0)
b.assign(0.0)

# Print out the parameters and loss before we do any training
Y_predicted = forward(X)
loss_value = loss_fn(Y_predicted, Y)
print("Before training: w = {:4.3f}, b = {:4.3f}, loss = {:7.3f}".format(w.numpy(), b.numpy(), loss_value))

print("")
print("Starting training")
print("")

# Start the training process
for i in range(max_number_of_epochs):

    # Update after every data point
    for (x_pt, y_pt) in zip(x, y):

        with tf.GradientTape() as tape:
            y_predicted = forward(x_pt)
            loss_value = loss_fn([y_predicted], [y_pt])

        optimizer.minimize(loss_value, var_list=[w, b], tape=tape)

        # Capture the data that we will use in our visualization.
        # Note that we are now updating our loss function after
        # every point in the sample, so the size of loss_array
        # will be greater by a factor of n_samples compared to
        # the last exercise.
        w_array.append(w.numpy())
        b_array.append(b.numpy())
        loss_array.append(loss_value)

    # At the end of every epoch after the first, print out the learned weights.
    # NOTE(review): these slices average the samples of the *previous* epoch
    # (epoch index i-1) while the label prints i+1 — confirm this lag is intended.
    if i > 0:
        avg_w = sum(w_array[(i-1)*n_samples:(i  )*n_samples]) / n_samples
        avg_b = sum(b_array[(i-1)*n_samples:(i  )*n_samples]) / n_samples
        avg_loss = sum(loss_array[(i-1)*n_samples:(i  )*n_samples]) / n_samples
        print("Epoch = {:2d}: w = {:4.3f}, b = {:4.3f}, loss = {:7.3f}".format(i+1, avg_w, avg_b, avg_loss))

    # End the training when the loss function has not changed from the last epoch
    # by more than a small amount. Note that in our convergence check we will compare
    # the loss averaged over this epoch with the loss averaged over the last epoch.
    if i > 1:
        average_loss_this_epoch = sum(loss_array[(i-1)*n_samples:(i  )*n_samples]) / n_samples
        average_loss_last_epoch = sum(loss_array[(i-2)*n_samples:(i-1)*n_samples]) / n_samples
        if abs(average_loss_this_epoch - average_loss_last_epoch) / average_loss_last_epoch < 0.001:
            break

print("")
print("Training finished after {} epochs".format(i+1))
print("")

# Report the averages over the final recorded epoch.
avg_w = sum(w_array[(i-1)*n_samples:(i  )*n_samples]) / n_samples
avg_b = sum(b_array[(i-1)*n_samples:(i  )*n_samples]) / n_samples
avg_loss = sum(loss_array[(i-1)*n_samples:(i  )*n_samples]) / n_samples

print("After training: w = {:4.3f}, b = {:4.3f}, loss = {:7.3f}".format(avg_w, avg_b, avg_loss))


# Visualize the loss after every single-sample update.
plt.close()
fig, ax = plt.subplots()
ax.plot(loss_array)
ax.set_xlabel("Number of Updates", size=24)
ax.set_ylabel("Loss", size=24)
ax.tick_params(axis='both', labelsize=16)
fig.tight_layout()
plt.show()


from mpl_toolkits.mplot3d import Axes3D

plt.close()

fig = plt.figure()
# fig.gca(projection='3d') was removed in Matplotlib 3.6; use add_subplot.
ax = fig.add_subplot(projection='3d')
# Draw the (w, b, loss) path traced by the per-sample updates.
ax.plot(w_array, b_array, loss_array)

ax.set_xlabel('w', size=16)
ax.set_ylabel('b', size=16)
ax.tick_params(labelsize=12)

plt.show()


plt.close()

fig = plt.figure()
# fig.gca(projection='3d') was removed in Matplotlib 3.6; use add_subplot.
ax2 = fig.add_subplot(projection='3d')

# Faint scatter of the precomputed loss surface with the SGD path overlaid.
ax2.scatter(w_surface, b_surface, loss_surface, c=loss_surface, alpha=0.02)
ax2.plot(w_array, b_array, loss_array, color='black')

ax2.set_xlabel('w')
ax2.set_ylabel('b')

plt.show()


### 2.2 在最简单的神经网络模型中试验批量大小

# Define the maximum number of times we want to process the entire dataset (the number of epochs).
# In practice we won't run this many because we'll implement an early stopping condition that
# detects when the training process has converged.
max_number_of_epochs = 1000

# We still store information about the optimization process here.
loss_array = []
b_array = []
w_array = []

# Zero out the initial values
w.assign(0.0)
b.assign(0.0)

# Print out the parameters and loss before we do any training
Y_predicted = forward(X)
loss_value = loss_fn(Y_predicted, Y)
print("Before training: w = {:4.3f}, b = {:4.3f}, loss = {:7.3f}".format(w.numpy(), b.numpy(), loss_value))

print("")
print("Starting training")
print("")

# Pass in batches of the dataset
# After the first run, try batch sizes of 16, 64, and 128
batch_size = 32
# (FIXME resolved) Ceiling division so a final, possibly smaller, batch
# covers any remainder when n_samples is not a multiple of batch_size.
num_batches_in_epoch = (n_samples + batch_size - 1) // batch_size

# Start the training process
for i in range(max_number_of_epochs):

    for j in range(num_batches_in_epoch):
        # (FIXME resolved) Slice out batch j; the last batch is clipped
        # to the dataset size.
        batch_start = j * batch_size
        batch_end = min((j + 1) * batch_size, n_samples)

        with tf.GradientTape() as tape:
            y_predicted = forward(x[batch_start:batch_end])
            loss_value = loss_fn(y_predicted, y[batch_start:batch_end])

        optimizer.minimize(loss_value, var_list=[w, b], tape=tape)

        # Capture the data that we will use in our visualization
        w_array.append(w.numpy())
        b_array.append(b.numpy())
        loss_array.append(loss_value)

    # At the end of every epoch after the first, print out the learned weights
    if i > 0:
        avg_w = sum(w_array[(i-1)*num_batches_in_epoch:(i  )*num_batches_in_epoch]) / num_batches_in_epoch
        avg_b = sum(b_array[(i-1)*num_batches_in_epoch:(i  )*num_batches_in_epoch]) / num_batches_in_epoch
        avg_loss = sum(loss_array[(i-1)*num_batches_in_epoch:(i  )*num_batches_in_epoch]) / num_batches_in_epoch
        print("Epoch = {:2d}: w = {:4.3f}, b = {:4.3f}, loss = {:7.3f}".format(i+1, avg_w, avg_b, avg_loss))

    # End the training when the loss function has not changed from the last epoch
    # by more than a small amount. Note that in our convergence check we will compare
    # the loss averaged over this epoch with the loss averaged over the last epoch.
    if i > 1:
        average_loss_this_epoch = sum(loss_array[(i-1)*num_batches_in_epoch:(i  )*num_batches_in_epoch]) / num_batches_in_epoch
        average_loss_last_epoch = sum(loss_array[(i-2)*num_batches_in_epoch:(i-1)*num_batches_in_epoch]) / num_batches_in_epoch
        if abs(average_loss_this_epoch - average_loss_last_epoch) / average_loss_last_epoch < 0.001:
            break

print("")
print("Training finished after {} epochs".format(i+1))
print("")

# Report the averages over the final recorded epoch.
avg_w = sum(w_array[(i-1)*num_batches_in_epoch:(i  )*num_batches_in_epoch]) / num_batches_in_epoch
avg_b = sum(b_array[(i-1)*num_batches_in_epoch:(i  )*num_batches_in_epoch]) / num_batches_in_epoch
avg_loss = sum(loss_array[(i-1)*num_batches_in_epoch:(i  )*num_batches_in_epoch]) / num_batches_in_epoch

print("After training: w = {:4.3f}, b = {:4.3f}, loss = {:7.3f}".format(avg_w, avg_b, avg_loss))


# Visualize the loss after every mini-batch update.
plt.close()
fig, ax = plt.subplots()
ax.plot(loss_array)
ax.set_xlabel("Number of Updates", size=24)
ax.set_ylabel("Loss", size=24)
ax.tick_params(axis='both', labelsize=16)
fig.tight_layout()
plt.show()
from mpl_toolkits.mplot3d import Axes3D

plt.close()

fig = plt.figure()
# fig.gca(projection='3d') was removed in Matplotlib 3.6; use add_subplot.
ax = fig.add_subplot(projection='3d')
# Draw the (w, b, loss) path traced by the mini-batch updates.
ax.plot(w_array, b_array, loss_array)

ax.set_xlabel('w', size=16)
ax.set_ylabel('b', size=16)
ax.tick_params(labelsize=12)

plt.show()
plt.close()

fig = plt.figure()
# fig.gca(projection='3d') was removed in Matplotlib 3.6; use add_subplot.
ax2 = fig.add_subplot(projection='3d')

# Faint scatter of the precomputed loss surface with the mini-batch path overlaid.
ax2.scatter(w_surface, b_surface, loss_surface, c=loss_surface, alpha=0.02)
ax2.plot(w_array, b_array, loss_array, color='black')

ax2.set_xlabel('w')
ax2.set_ylabel('b')

plt.show()


## 三、总结

1. 神经网络训练过程的基本组成部分
2. 如何实现梯度下降优化器
3. 梯度下降法和随机梯度下降法之间的差别，以及二者对优化过程的影响
4. 批量（batch size）如何影响训练的精度

# Shut down the notebook kernel so its resources (e.g. GPU memory) are
# released before the next exercise.  do_shutdown(True) requests a restart.
import IPython
app = IPython.Application.instance()
app.kernel.do_shutdown(True)


## 参考资料

• NVIDIA 的课件《用多 GPU 训练神经网络》