单机多卡并行及多GPU训练

将ASIC设计为2D，为了深度学习内的矩阵运算；卷积也可以换成矩阵乘法。FPGA通常是做模拟的，模拟成功后再进行ASIC流片。深度学习是计算密集型运算，GPU更适合计算密集型运算。四、Systolic Array。这是一种板子，可以进行烧制编程。一个PE里面可以做一个简单运算。一、DSP：数字信号处理。FFT：快速傅里叶变换。二、可编程阵列FPGA。

ccdous

725人浏览 · 2024-07-10 10:53:05

ccdous · 2024-07-10 10:53:05 发布

一、定义

1、单机多卡并行

2、数据并行与模型并行

数据并行通常是拿到各个梯度后加和得到总梯度

3、数据并行：

（1）读一个数据块

（2）拿回参数

（3）计算梯度

（4）发出梯度（加和）

（5）更新参数（使用加和后的总梯度）

4、总结

二、多GPU训练

#向多个设备分发参数并附加梯度
def get_params(params, device):
    """Return independent, trainable copies of ``params`` placed on ``device``.

    Each returned tensor is a fresh leaf tensor with ``requires_grad=True``.
    The source tensors are never modified and never shared with the result.

    Args:
        params: Iterable of tensors to copy.
        device: Target device for the copies.

    Returns:
        A list with one new tensor per entry of ``params``.
    """
    # ``Tensor.to`` returns the very same tensor when it already lives on
    # ``device``; without ``copy=True`` the "copy" would alias the source and
    # ``requires_grad_()`` would silently flip the flag on the caller's tensor.
    # ``detach()`` makes sure the copy is a leaf even when the source already
    # requires grad, so ``.grad`` is populated on the copy after backward.
    return [p.detach().to(device, copy=True).requires_grad_() for p in params]

# Copy `params` onto the device returned by d2l.try_gpu(0), then inspect the
# entry at index 1.
new_params = get_params(params, d2l.try_gpu(0))
# The printed labels call this entry "b1" — presumably the first bias term;
# confirm against the definition of `params`.
print('b1 权重:', new_params[1])
# No backward pass happens between the copy and this print, so .grad is
# presumably still None here.
print('b1 梯度:', new_params[1].grad)

#跨多个设备对参数求和
def allreduce(data):
    """Sum every tensor in ``data`` and leave the total in each of them.

    The reduction is done in place: all entries after the first are added
    into ``data[0]`` (after moving them to its device), and the resulting
    total is then written back into every other entry on that entry's own
    device. Nothing is returned.
    """
    others = data[1:]
    # Reduce: accumulate every other tensor into the first one.
    for tensor in others:
        data[0][:] += tensor.to(data[0].device)
    # Broadcast: overwrite every other tensor with the accumulated total.
    for tensor in others:
        tensor[:] = data[0].to(tensor.device)

#数据分发
# Toy input: the integers 0..19 arranged as a 4x5 tensor.
data = torch.arange(20).reshape(4, 5)
# Two CUDA devices are hard-coded, so this demo needs at least two GPUs.
devices = [torch.device('cuda:0'), torch.device('cuda:1')]
# Distribute `data` across `devices`; nn.parallel.scatter splits along dim 0
# by default (see the PyTorch documentation).
split = nn.parallel.scatter(data, devices)

#@save
def split_batch(X, y, devices):
    """Scatter ``X`` and ``y`` across ``devices``.

    ``X`` and ``y`` must have the same size along their first dimension.
    Returns a pair ``(X_shards, y_shards)`` as produced by
    ``nn.parallel.scatter``.
    """
    assert X.shape[0] == y.shape[0]
    X_shards = nn.parallel.scatter(X, devices)
    y_shards = nn.parallel.scatter(y, devices)
    return (X_shards, y_shards)

#在一个小批量上实现多GPU训练
def train_batch(X, y, device_params, devices, lr):
    """Run one data-parallel training step on a single mini-batch.

    Args:
        X: Mini-batch inputs.
        y: Mini-batch labels.
        device_params: One list of parameters per device.
        devices: Devices the mini-batch is split across.
        lr: Learning rate forwarded to ``d2l.sgd``.
    """
    X_shards, y_shards = split_batch(X, y, devices)
    # Compute the summed loss of each shard with that device's parameters.
    shard_losses = []
    for X_shard, y_shard, replica in zip(X_shards, y_shards, device_params):
        shard_losses.append(loss(lenet(X_shard, replica), y_shard).sum())
    # Back-propagate each shard's loss separately.
    for shard_loss in shard_losses:
        shard_loss.backward()
    # Sum the gradients of the same parameter across devices and write the
    # total back to every device.
    with torch.no_grad():
        num_params = len(device_params[0])
        for layer in range(num_params):
            grads = [device_params[dev][layer].grad
                     for dev in range(len(devices))]
            allreduce(grads)
    # Update each device's parameters; the full mini-batch size (not the
    # shard size) is passed to d2l.sgd.
    for replica in device_params:
        d2l.sgd(replica, lr, X.shape[0])

#定义训练函数
def train(num_gpus, batch_size, lr):
    """Train for 10 epochs with the hand-written data-parallel step.

    Args:
        num_gpus: Number of devices requested through ``d2l.try_gpu``.
        batch_size: Batch size passed to ``d2l.load_data_fashion_mnist``.
        lr: Learning rate forwarded to ``train_batch``.

    Test accuracy is plotted after every epoch, and a summary line with the
    final accuracy and average epoch time is printed at the end.
    """
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
    devices = [d2l.try_gpu(idx) for idx in range(num_gpus)]
    # One copy of the model parameters per device.
    device_params = [get_params(params, dev) for dev in devices]
    num_epochs = 10
    animator = d2l.Animator('epoch', 'test acc', xlim=[1, num_epochs])
    timer = d2l.Timer()

    def predict(x):
        # Evaluation uses the parameter copy held by the first device.
        return lenet(x, device_params[0])

    for epoch in range(1, num_epochs + 1):
        timer.start()
        for X, y in train_iter:
            # Multi-GPU training step for a single mini-batch.
            train_batch(X, y, device_params, devices, lr)
            # Wait for outstanding CUDA work so the timer measures it.
            torch.cuda.synchronize()
        timer.stop()
        test_acc = d2l.evaluate_accuracy_gpu(predict, test_iter, devices[0])
        animator.add(epoch, (test_acc,))
    print(f'测试精度：{animator.Y[0][-1]:.2f}，{timer.avg():.1f}秒/轮，'
          f'在{str(devices)}')

三、简洁实现：使用nn.DataParallel进行自动分发

选用ResNet-18，比上面的LeNet更适合做数据并行，因为它的计算量更大、训练得更慢。由于输入图像较小，这里对网络做了一些修改：使用了更小的卷积核、步长和填充，而且删除了最大汇聚层。

def train(net, num_gpus, batch_size, lr):
    """Train ``net`` for 10 epochs using ``nn.DataParallel``.

    Args:
        net: Model to train. Its ``nn.Linear`` and ``nn.Conv2d`` weights are
            re-initialized in place, and the module is moved to the first
            device.
        num_gpus: Number of devices requested through ``d2l.try_gpu``.
        batch_size: Batch size passed to ``d2l.load_data_fashion_mnist``.
        lr: Learning rate for ``torch.optim.SGD``.

    Test accuracy is plotted after every epoch, and a summary line with the
    final accuracy and average epoch time is printed at the end.
    """
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
    devices = [d2l.try_gpu(i) for i in range(num_gpus)]

    def init_weights(m):
        # Exact type match on purpose: subclasses keep their own init.
        if type(m) in [nn.Linear, nn.Conv2d]:
            nn.init.normal_(m.weight, std=0.01)

    net.apply(init_weights)
    # Set up the model on multiple devices. DataParallel requires the
    # module's parameters and buffers to already live on device_ids[0];
    # otherwise the first forward pass raises a RuntimeError, so move the
    # module there before wrapping it.
    net = nn.DataParallel(net.to(devices[0]), device_ids=devices)
    trainer = torch.optim.SGD(net.parameters(), lr)
    loss = nn.CrossEntropyLoss()
    timer, num_epochs = d2l.Timer(), 10
    animator = d2l.Animator('epoch', 'test acc', xlim=[1, num_epochs])
    for epoch in range(num_epochs):
        net.train()
        timer.start()
        for X, y in train_iter:
            trainer.zero_grad()
            # Inputs go to the first device; DataParallel scatters them.
            X, y = X.to(devices[0]), y.to(devices[0])
            l = loss(net(X), y)
            l.backward()
            trainer.step()
        timer.stop()
        animator.add(epoch + 1, (d2l.evaluate_accuracy_gpu(net, test_iter),))
    print(f'测试精度：{animator.Y[0][-1]:.2f}，{timer.avg():.1f}秒/轮，'
          f'在{str(devices)}')

四、总结

有多种方法可以在多个GPU上拆分深度网络的训练。拆分可以在层之间、跨层或跨数据上实现。前两者需要对数据传输过程进行严格编排（模型并行），而最后一种则是最简单的策略（数据并行）。
数据并行训练本身是不复杂的，它通过增加有效的小批量数据量的大小提高了训练效率。
在数据并行中，数据需要跨多个GPU拆分，其中每个GPU执行自己的前向传播和反向传播，随后所有的梯度被聚合为一，之后聚合结果向所有的GPU广播。
小批量数据量更大时，学习率也需要稍微提高一些。