本节主要介绍如何使用MindTorch在启智平台上完成PyTorch代码的迁移任务,并通过实践案例学习MindTorch的开发方法。

引言:MindTorch是一款将PyTorch训练脚本高效迁移至MindSpore框架执行的实用工具,旨在不改变原生PyTorch用户编程习惯的前提下,使PyTorch风格的代码能在昇腾硬件上获得高效性能。用户只需在PyTorch源代码主入口中导入torch系列相关包(如torch、torchvision等)之前,先添加一行 from mindtorch.tools import mstorch_enable,再对训练代码做少量适配,即可实现模型在昇腾硬件上的训练。

将现有PyTorch原生代码利用MindTorch移植至MindSpore时,当前通常需要如下几个步骤:

  • 使用MindTorch(必选)
  • 适配微分接口(训练场景)
  • 适配优化器和学习率接口(训练场景)
  • 适配混合精度接口(训练场景)
  • 适配分布式接口(可选)
  • 模型加载和保存(可选)

详细的使用说明可以参考《MindTorch用户指南》。

实验1

from mindtorch.tools import mstorch_enable
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

# 1.Working with data
# Load the FashionMNIST training split first, then the test split. Both use
# the same root directory, are downloaded from open datasets when missing,
# and convert every image with ToTensor(); only the `train` flag differs.
training_data, test_data = (
    datasets.FashionMNIST(root="data", train=is_train, download=True, transform=ToTensor())
    for is_train in (True, False)
)


# 2.Creating Models
class NeuralNetwork(nn.Module):
    """Fully connected classifier for 28x28 inputs.

    The input is flattened and passed through
    Linear(784, 512) -> ReLU -> Linear(512, 512) -> ReLU -> Linear(512, 10).
    The result is the raw 10-way logits; no softmax is applied here.
    """

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        # Layers are listed in execution order; their position inside the
        # Sequential container determines the keys used in the state dict.
        layers = [
            nn.Linear(28 * 28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
        ]
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Return the logits computed for the batch ``x``."""
        return self.linear_relu_stack(self.flatten(x))


def train(dataloader, model, loss_fn, optimizer, device, log_interval=1):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        dataloader: Iterable of ``(X, y)`` batches that exposes a ``dataset``
            attribute; the dataset length is used as the progress total.
        model: Module to optimize; it is switched to training mode.
        loss_fn: Callable computing the loss from ``(pred, y)``.
        optimizer: Optimizer whose ``step``/``zero_grad`` update the model.
        device: Target passed to ``Tensor.to`` for every batch.
        log_interval: Print the loss every ``log_interval`` batches. The
            default of 1 prints after every batch.

    Raises:
        ValueError: If ``log_interval`` is smaller than 1.
    """
    if log_interval < 1:
        raise ValueError(f"log_interval must be >= 1, got {log_interval}")

    size = len(dataloader.dataset)
    seen = 0  # samples processed so far, used for the progress display
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Accumulate the real batch sizes: the last batch may be smaller than
        # the others, so (batch + 1) * len(X) would under-report progress.
        seen += len(X)
        if batch % log_interval == 0:
            print(f"loss: {loss.item():>7f}  [{seen:>5d}/{size:>5d}]")


def test(dataloader, model, loss_fn, device):
    """Evaluate ``model`` on ``dataloader`` and print accuracy and mean loss.

    The loss is averaged over the number of batches, and the accuracy over
    the number of samples in ``dataloader.dataset``. Gradients are disabled
    and the model is put in evaluation mode for the whole pass.
    """
    sample_count = len(dataloader.dataset)
    batch_count = len(dataloader)
    model.eval()

    loss_total = 0
    hits = 0
    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            logits = model(inputs)
            loss_total += loss_fn(logits, targets).item()
            # A prediction is correct when the highest-scoring class
            # matches the target label.
            matches = logits.argmax(1) == targets
            hits += matches.type(torch.float).sum().item()

    avg_loss = loss_total / batch_count
    accuracy = hits / sample_count
    print(f"Test Error: \n Accuracy: {(100 * accuracy):>0.1f}%, Avg loss: {avg_loss:>8f} \n")


if __name__ == '__main__':
    # Wrap both dataset splits in loaders using the same batch size.
    batch_size = 64
    train_dataloader = DataLoader(training_data, batch_size=batch_size)
    test_dataloader = DataLoader(test_data, batch_size=batch_size)

    # Get cpu, gpu or mps device for training: CUDA is preferred, then MPS,
    # and the CPU is the fallback.
    if torch.cuda.is_available():
        device = "cuda"
    elif torch.backends.mps.is_available():
        device = "mps"
    else:
        device = "cpu"
    model = NeuralNetwork().to(device)

    # 3.Optimizing the Model Parameters
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

    # Each epoch is one training pass followed by one evaluation pass.
    epochs = 5
    for epoch_number in range(1, epochs + 1):
        print(f"Epoch {epoch_number}\n-------------------------------")
        train(train_dataloader, model, loss_fn, optimizer, device)
        test(test_dataloader, model, loss_fn, device)
    print("Done!")

    # 4.Saving Models
    checkpoint_path = "model.pth"
    torch.save(model.state_dict(), checkpoint_path)
    print("Saved PyTorch Model State to model.pth")

    # 5.Loading Models
    # Build a fresh network and restore the weights that were just saved.
    model = NeuralNetwork().to(device)
    model.load_state_dict(torch.load(checkpoint_path))

    # Human-readable label names, indexed by class id.
    classes = [
        "T-shirt/top", "Trouser", "Pullover", "Dress", "Coat",
        "Sandal", "Shirt", "Sneaker", "Bag", "Ankle boot",
    ]

    # 6.Predicted
    # Classify the first test sample and compare against its label.
    model.eval()
    sample, label = test_data[0]
    with torch.no_grad():
        logits = model(sample.to(device))
        predicted = classes[logits[0].argmax(0)]
        actual = classes[label]
        print(f'Predicted: "{predicted}", Actual: "{actual}"')

运行结果1

在这里插入图片描述

实验2

from mindtorch.tools import mstorch_enable
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from torchvision.transforms.functional import InterpolationMode
import mindspore as ms
import argparse
import time

### data prepare
# Preprocessing steps applied to every image, in order: bicubic resize to
# 224x224, conversion to a tensor, then per-channel normalization.
_preprocess_steps = [
    transforms.Resize((224, 224), interpolation=InterpolationMode.BICUBIC),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.247, 0.2435, 0.2616]),
]
transform = transforms.Compose(_preprocess_steps)


### model construct
class AlexNet(nn.Module):
    """AlexNet-style convolutional classifier.

    Five convolutions (each followed by ReLU, three of them also by a 3x3
    stride-2 max-pool) feed an adaptive average pool to 6x6 and a three-layer
    fully connected head that produces ``num_classes`` logits.
    """

    # One row per convolution stage:
    # (in_channels, out_channels, kernel, stride, padding, max_pool_after)
    _CONV_STAGES = (
        (3, 64, 11, 4, 2, True),
        (64, 192, 5, 1, 2, True),
        (192, 384, 3, 1, 1, False),
        (384, 256, 3, 1, 1, False),
        (256, 256, 3, 1, 1, True),
    )

    def __init__(self, num_classes: int = 10) -> None:
        super().__init__()

        # Expand the stage table into Conv2d -> ReLU [-> MaxPool2d] modules.
        # The resulting order fixes the indices used in the state dict.
        feature_layers = []
        for c_in, c_out, kernel, stride, pad, pool_after in self._CONV_STAGES:
            feature_layers.append(
                nn.Conv2d(c_in, c_out, (kernel, kernel), (stride, stride), (pad, pad))
            )
            feature_layers.append(nn.ReLU())
            if pool_after:
                feature_layers.append(nn.MaxPool2d((3, 3), (2, 2)))
        self.features = nn.Sequential(*feature_layers)

        self.avgpool = nn.AdaptiveAvgPool2d((6, 6))

        head_layers = [
            nn.Dropout(0.5),
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(4096, 4096),
            nn.ReLU(),
            nn.Linear(4096, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return the class logits for the image batch ``x``."""
        return self._forward_impl(x)

    # Support torch.script function
    def _forward_impl(self, x):
        # features -> 6x6 pooling -> flatten everything but the batch
        # dimension -> classifier head.
        pooled = self.avgpool(self.features(x))
        return self.classifier(torch.flatten(pooled, 1))

# Cross-entropy loss instance used by both train() and test() in this script.
criterion = nn.CrossEntropyLoss()


################# model train ###############
def train(config_args):
    """Train AlexNet on the CIFAR-10 training split and save its weights.

    Reads ``dataset``, ``epoch``, ``device`` and ``save_path`` from
    ``config_args``. Gradients are obtained through MindSpore's
    ``value_and_grad`` and applied by calling the optimizer on them, instead
    of the ``loss.backward()`` / ``optimizer.step()`` pair.
    """
    cifar_train = datasets.CIFAR10(config_args.dataset, train=True, download=True, transform=transform)
    loader = DataLoader(cifar_train, batch_size=16, shuffle=True, num_workers=0, drop_last=True)

    net = AlexNet().to(config_args.device)
    optimizer = torch.optim.SGD(net.parameters(), lr=0.01, momentum=0.9, weight_decay=0.0005)

    def compute_loss(data, label):
        # The logits are returned next to the loss as an auxiliary output
        # (see has_aux=True below).
        logits = net(data)
        return criterion(logits, label), logits

    # Build a function returning ((loss, logits), grads), differentiating
    # with respect to the optimizer's parameters.
    grad_fn = ms.ops.value_and_grad(compute_loss, None, optimizer.parameters, has_aux=True)

    def run_step(data, label):
        (step_loss, _), grads = grad_fn(data, label)
        optimizer(grads)
        return step_loss

    net.train()
    print("begin training ......")
    for epoch in range(config_args.epoch):
        started_at = time.time()
        for images, labels in loader:
            batch_loss = run_step(images, labels)
            print("---------------------->epoch:{}, loss:{:.6f}".format(epoch, batch_loss.asnumpy()))
        print("--------------->epoch:{}, total time:{:.6f}".format(epoch, time.time() - started_at))
    torch.save(net.state_dict(), config_args.save_path)


################# model eval ###############
def test(config_args):
    """Evaluate the saved AlexNet checkpoint on the CIFAR-10 test split.

    Reads ``dataset``, ``device`` and ``load_path`` from ``config_args`` and
    prints the accuracy together with the loss averaged over batches.
    """
    test_images = datasets.CIFAR10(config_args.dataset, train=False, download=True, transform=transform)
    test_data = DataLoader(test_images, batch_size=128, shuffle=True, num_workers=4, drop_last=True)

    net = AlexNet().to(config_args.device)
    net.load_state_dict(torch.load(config_args.load_path), strict=True)
    num_batches = len(test_data)
    net.eval()
    # The loader uses drop_last=True, so the final incomplete batch is never
    # evaluated. Accuracy is therefore divided by the number of samples that
    # were actually seen, not by len(dataset), which would count the dropped
    # samples as misclassified.
    test_loss, correct, evaluated = 0, 0, 0
    print("begin testing ......")
    with torch.no_grad():   # comment out this line for graph mode accelerating
        for X, y in test_data:
            X, y = X.to(config_args.device), y.to(config_args.device)
            pred = net(X)
            test_loss += criterion(pred, y).item()
            correct += (pred.argmax(1) == y).to(torch.float).sum().item()
            evaluated += y.shape[0]
    test_loss /= num_batches
    correct /= evaluated
    print(f"Test Error: \n Accuracy: {(100 * correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")


if __name__ == "__main__":
    # Seed the torch random number generator.
    seed = 1
    torch.manual_seed(seed)

    parser = argparse.ArgumentParser()
    # Restrict --mode to the two supported values so that a typo is rejected
    # by argparse instead of silently running neither training nor testing.
    parser.add_argument('--mode', type=str, default='train', choices=('train', 'test'),
                        help='Execute training or testing.')
    parser.add_argument('--device', type=str, default='Ascend', help='Select the hardware device for execution.')
    parser.add_argument('--epoch', type=int, default=20, help='Epoch size of training.')
    parser.add_argument('--save_path', type=str, default='./alexnet.pth', help='Training output path for local.')
    parser.add_argument('--load_path', type=str, default='./alexnet.pth',
                        help='Pretrained checkpoint path for fine tune or evaluating.')
    parser.add_argument('--dataset', default='./', help='Dataset root directory path')
    config_args = parser.parse_args()

    # Map the --device value onto the MindSpore device target. Unrecognised
    # values only produce a warning and leave the context untouched.
    if config_args.device in ("gpu", "GPU", "cuda"):
        ms.context.set_context(device_target="GPU")
    elif config_args.device in ("cpu", "CPU"):
        ms.context.set_context(device_target="CPU")
    elif config_args.device == "Ascend":
        ms.context.set_context(device_target="Ascend")
    else:
        print("WARNING: '--device' configuration is abnormal, and the appropriate device will be adapted.")

    # for graph mode accelerating
    # ms.context.set_context(mode=ms.GRAPH_MODE)
    # ms.set_context(jit_syntax_level=ms.STRICT)

    if config_args.mode == 'train':
        train(config_args)
    elif config_args.mode == 'test':
        test(config_args)

运行结果2

在这里插入图片描述

实验3

from mindtorch.tools import mstorch_enable  # 使用mindtorch时启用
import torch
from torch import nn
from mindtorch.tools import debug_layer_info

class NeuralNetwork(nn.Module):
    """Fully connected classifier for 28x28 inputs that logs its flattened shape.

    The input is flattened and passed through
    Linear(784, 512) -> ReLU -> Linear(512, 512) -> ReLU -> Linear(512, 10).
    The shape of the flattened tensor is printed on every forward pass.
    """

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        # Layers are listed in execution order; their position inside the
        # Sequential container determines the keys used in the state dict.
        layers = [
            nn.Linear(28 * 28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
        ]
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Print the flattened input shape and return the logits."""
        flat = self.flatten(x)
        print(flat.shape)
        return self.linear_relu_stack(flat)
    
model = NeuralNetwork()
# torch.save(model.state_dict(), 'parameters.pth')   # enable on the first run to save the model parameters
model.load_state_dict(torch.load('parameters.pth'))
model.eval()

# debug_layer_info(model, frame='pytorch')    # enable when running with PyTorch
debug_layer_info(model)                      # enable when running with MindTorch

# Named `inputs` rather than `input` so the built-in input() is not shadowed.
inputs = torch.ones((3, 28, 28))
output = model(inputs)

运行结果3

在这里插入图片描述
在这里插入图片描述

Logo

昇腾计算产业是基于昇腾系列(HUAWEI Ascend)处理器和基础软件构建的全栈AI计算基础设施、行业应用及服务,包括昇腾系列处理器、系列硬件、CANN、AI计算框架、应用使能、开发工具链、管理运维工具、行业应用及服务等全产业链。

更多推荐