Ascend910B 自定义Sigmoid算子开发关键要点

链接: https://pan.baidu.com/s/1iUm4BAQ7DHaJIQHUCeUSUg?pwd=jhqu

初级认证
中级认证

环境说明

本硬件平台为Ascend910B;若使用Ascend310B平台,需对应适配调整相关配置。

步骤

一、环境配置

  1. 环境重启后需重新加载配置:source ~/.bashrc

  2. 加载Ascend工具链环境变量:source /home/ma-user/Ascend/ascend-toolkit/set_env.sh

二、需修文件

开发过程中需精准修改以下4个文件,所有文件均需完成配置,否则将导致最终结果异常:

- SigmoidCustom/SigmoidCustom/CMakePresets.json

- SigmoidCustom/SigmoidCustom/op_host/sigmoid_custom_tiling.h

- SigmoidCustom/SigmoidCustom/op_host/sigmoid_custom.cpp

- SigmoidCustom/SigmoidCustom/op_kernel/sigmoid_custom.cpp

三、基础代码实现

基础代码聚焦于以下两个文件,逻辑相对易懂,主要完成tiling配置与基础参数初始化:

SigmoidCustom/SigmoidCustom/op_host/sigmoid_custom_tiling.h
SigmoidCustom/SigmoidCustom/op_host/sigmoid_custom.cpp
1. sigmoid_custom_tiling.h 新增内容
TILING_DATA_FIELD_DEF(uint32_t, totalLength);
TILING_DATA_FIELD_DEF(uint32_t, tileNum);
2. sigmoid_custom.cpp 新增内容
const uint32_t BLOCK_DIM = 8;
const uint32_t TILE_NUM = 8;
uint32_t totalLength = context->GetInputShape(0)->GetOriginShape().GetShapeSize();
context->SetBlockDim(BLOCK_DIM);
tiling.set_totalLength(totalLength);
tiling.set_tileNum(TILE_NUM);
tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), 
context->GetRawTilingData()->GetCapacity());
context->GetRawTilingData()->SetDataSize(tiling.GetDataSize());
size_t *currentWorkspace = context->GetWorkspaceSizes(1);
currentWorkspace[0] = 0;
return ge::GRAPH_SUCCESS;

四、核心计算逻辑说明

直接使用内置的 Sigmoid 融合指令:

  1. 精度更高,避免了手动 Exp/Adds/Reciprocal 的累积误差
  2. 性能更好,硬件层面有专用指令优化

#include "kernel_operator.h"

using namespace AscendC;

// 假设 DTYPE_X 和 DTYPE_Y 在编译宏中已定义为 half (fp16)
constexpr int32_t BUFFER_NUM = 2;

class KernelSigmoid {
public:
    __aicore__ inline KernelSigmoid() {}

    __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, uint32_t totalLength, uint32_t tileNum)
    {
        // 1. 计算每个核处理的数据长度
        ASSERT(GetBlockNum() != 0 && "block dim can not be zero!");
        this->blockLength = totalLength / GetBlockNum();
        this->tileNum = tileNum;
        
        ASSERT(tileNum != 0 && "tile num can not be zero!");
        // 2. 计算每一份 Tile 的长度(分两次 Buffer 轮转)
        this->tileLength = this->blockLength / tileNum / BUFFER_NUM;

        // 3. 设置 Global Buffer 偏移
        xGm.SetGlobalBuffer((__gm__ DTYPE_X *)x + this->blockLength * GetBlockIdx(), this->blockLength);
        yGm.SetGlobalBuffer((__gm__ DTYPE_Y *)y + this->blockLength * GetBlockIdx(), this->blockLength);

        // 4. 初始化 Pipe 缓冲区
        // 注意:使用内置 Sigmoid 指令不再需要手动申请多个 tmpBuffer
        pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(DTYPE_X));
        pipe.InitBuffer(outQueueY, BUFFER_NUM, this->tileLength * sizeof(DTYPE_Y));
    }

    __aicore__ inline void Process()
    {
        // 计算总的循环次数:Tile数量 * Buffer数量
        int32_t loopCount = this->tileNum * BUFFER_NUM;
        for (int32_t i = 0; i < loopCount; i++) {
            CopyIn(i);
            Compute(i);
            CopyOut(i);
        }
    }

private:
    __aicore__ inline void CopyIn(int32_t progress)
    {
        LocalTensor<DTYPE_X> xLocal = inQueueX.AllocTensor<DTYPE_X>();
        // 将数据从 Global Memory 拷贝到 Local Memory
        DataCopy(xLocal, xGm[progress * this->tileLength], this->tileLength);
        inQueueX.EnQue(xLocal);
    }

    __aicore__ inline void Compute(int32_t progress)
    {
        // 1. 从输入队列取出数据
        LocalTensor<DTYPE_X> xLocal = inQueueX.DeQue<DTYPE_X>();
        
        // 2. 分配输出 LocalTensor
        LocalTensor<DTYPE_Y> yLocal = outQueueY.AllocTensor<DTYPE_Y>();
        
        /**
         * 核心修改:直接使用内置的 Sigmoid 融合指令
         * 优点:1. 精度更高,避免了手动 Exp/Adds/Reciprocal 的累积误差
         *      2. 性能更好,硬件层面有专用指令优化
         */
        Sigmoid(yLocal, xLocal, this->tileLength);
        
        // 3. 结果入队
        outQueueY.EnQue(yLocal);
        
        // 4. 释放输入 Tensor
        inQueueX.FreeTensor(xLocal);
    }

    __aicore__ inline void CopyOut(int32_t progress)
    {
        LocalTensor<DTYPE_Y> yLocal = outQueueY.DeQue<DTYPE_Y>();
        // 将结果写回 Global Memory
        DataCopy(yGm[progress * this->tileLength], yLocal, this->tileLength);
        outQueueY.FreeTensor(yLocal);
    }

private:
    TPipe pipe;
    TQue<QuePosition::VECIN, BUFFER_NUM> inQueueX;
    TQue<QuePosition::VECOUT, BUFFER_NUM> outQueueY;
    GlobalTensor<DTYPE_X> xGm;
    GlobalTensor<DTYPE_Y> yGm;

    uint32_t blockLength;
    uint32_t tileNum;
    uint32_t tileLength;
};

extern "C" __global__ __aicore__ void sigmoid_custom(GM_ADDR x, GM_ADDR y, GM_ADDR workspace, GM_ADDR tiling) {
    GET_TILING_DATA(tiling_data, tiling);
    KernelSigmoid op;
    op.Init(x, y, tiling_data.totalLength, tiling_data.tileNum);
    op.Process();
}

五、权限配置步骤

SigmoidCustom/SigmoidCustom/目录下执行以下命令,赋予所有文件完整操作权限:


chmod +x -R *

最后

按考试题步骤编译即可,成功之后把源码提交,最好删注释

Logo

昇腾计算产业是基于昇腾系列(HUAWEI Ascend)处理器和基础软件构建的全栈 AI计算基础设施、行业应用及服务,https://devpress.csdn.net/organization/setting/general/146749包括昇腾系列处理器、系列硬件、CANN、AI计算框架、应用使能、开发工具链、管理运维工具、行业应用及服务等全产业链

更多推荐