CANN计算机视觉加速利器:ops-cv图像处理与目标检测算子库详解

一、项目简介

ops-cv 是CANN提供的图像处理、目标检测相关的算子库,专门为计算机视觉任务设计的高性能算子集合。随着计算机视觉技术在自动驾驶、智能安防、医疗影像等领域的广泛应用,对图像处理和目标检测算子的性能需求日益增长。ops-cv算子库涵盖了图像预处理、特征提取、目标检测、图像变换等各类CV核心算子,为计算机视觉应用在NPU上的高效运行提供关键支持。

该算子库针对图像数据的特点进行了深度优化,包括数据格式转换、色彩空间转换、几何变换、特征图处理等,能够显著提升CV模型的推理性能。

相关链接:

  • CANN组织链接:https://atomgit.com/cann
  • ops-cv仓库链接:https://atomgit.com/cann/ops-cv

二、核心功能与特性

2.1 算子分类

ops-cv算子库按照功能可分为以下几大类:

| 算子类别 | 主要功能 | 应用场景 |
| --- | --- | --- |
| 图像预处理 | Resize、Crop、Pad、Normalize等 | 数据增强、模型输入准备 |
| 色彩空间转换 | RGB2BGR、RGB2YUV、RGB2GRAY等 | 图像格式转换 |
| 几何变换 | Affine、Perspective、Rotate、Flip等 | 数据增强、姿态校正 |
| 特征提取 | Conv2D、Dilation、Erosion、Sobel等 | 边缘检测、特征提取 |
| 目标检测 | NMS、ROIAlign、ROIPooling等 | 检测后处理 |
| 图像滤波 | GaussianBlur、MedianBlur、Bilateral等 | 噪声去除、图像平滑 |
| 形态学操作 | Open、Close、Gradient等 | 形状分析、特征增强 |

2.2 技术特性

  1. 多格式支持:支持NCHW、NHWC、HWC等多种数据布局
  2. 多精度计算:支持FP32、FP16、UINT8等多种数据类型
  3. 硬件加速:针对NPU的向量计算单元进行优化
  4. 零拷贝优化:支持原地计算,减少内存拷贝开销
  5. 批处理优化:支持多图像批量处理

三、环境准备

3.1 系统要求

  • 操作系统:Ubuntu 18.04/20.04/22.04
  • 处理器:Atlas 200I/300T/800T系列
  • CANN版本:CANN 8.0.RC3及以上
  • OpenCV:3.4+ 或 4.x(用于数据对比验证)

3.2 依赖安装

# Install OpenCV (used for comparison/verification of results)
sudo apt-get install -y libopencv-dev python3-opencv

# Clone the ops-cv repository and enter it
git clone https://atomgit.com/cann/ops-cv.git
cd ops-cv

# Configure in a separate build directory, then build (one job per processor reported by nproc) and install
mkdir build && cd build
cmake .. \
    -DCMAKE_BUILD_TYPE=Release \
    -DCANN_INSTALL_PATH=/usr/local/Ascend \
    -DWITH_OPENCV=ON
make -j$(nproc)
make install

四、图像预处理算子示例

4.1 图像缩放(Resize)

图像缩放是计算机视觉中最常用的预处理操作之一:

#include "cv_ops.h"

// Kernel entry point: resizes one uint8 image with the ops-cv Resize operator.
//
// Parameters:
//   input_gm   - source image in global memory, laid out as [src_h, src_w, channels]
//   output_gm  - destination image in global memory, laid out as [dst_h, dst_w, channels]
//   src_h, src_w, channels - source dimensions
//   dst_h, dst_w           - target dimensions
//   mode       - interpolation mode (NEAREST, BILINEAR or BICUBIC)
extern "C" __global__ __aicore__ void image_resize(
    GM_ADDR input_gm,
    GM_ADDR output_gm,
    uint32_t src_h,
    uint32_t src_w,
    uint32_t channels,
    uint32_t dst_h,
    uint32_t dst_w,
    InterpolationMode mode
)
{
    // Wrap the global-memory addresses.
    Tensor src_tensor(input_gm);
    Tensor dst_tensor(output_gm);

    // Obtain the local tensors the operator works on.
    LocalTensor<uint8_t> src_local = src_tensor.In<uint8_t>();
    LocalTensor<uint8_t> dst_local = dst_tensor.Out<uint8_t>();

    // Configure and run the resize operator.
    Resize<uint8_t> resizer;
    resizer.Init(src_h, src_w, channels, dst_h, dst_w, mode);
    resizer.Compute(src_local, dst_local);

    // Copy the whole resized image back to global memory.
    const uint32_t dst_elems = dst_h * dst_w * channels;
    DataCopy(output_gm, dst_local, dst_elems);

    // Release the local tensors.
    src_local.Free();
    dst_local.Free();
}

4.2 图像裁剪(Crop)

// Kernel entry point: copies a rectangular window out of a uint8 image, one row at a time.
//
// Parameters:
//   input_gm   - source image in global memory, laid out as [src_h, src_w, channels]
//   output_gm  - cropped image in global memory, laid out as [crop_h, crop_w, channels]
//   src_h, src_w, channels - source dimensions
//   crop_top, crop_left    - row / column of the window's top-left pixel
//   crop_h, crop_w         - window size
//
// If the window does not lie completely inside the source image the kernel
// returns without writing anything.
extern "C" __global__ __aicore__ void image_crop(
    GM_ADDR input_gm,
    GM_ADDR output_gm,
    uint32_t src_h,
    uint32_t src_w,
    uint32_t channels,
    uint32_t crop_top,
    uint32_t crop_left,
    uint32_t crop_h,
    uint32_t crop_w
)
{
    // Reject windows that leave the source image: the row copies below would
    // otherwise read past the end of input_gm. The subtraction form keeps the
    // comparison itself free of uint32 wrap-around.
    if (crop_top > src_h || crop_h > src_h - crop_top ||
        crop_left > src_w || crop_w > src_w - crop_left) {
        return;
    }

    Tensor input = Tensor(input_gm);
    Tensor output = Tensor(output_gm);

    // Only an output-side local tensor is needed; it is reused as the staging
    // buffer for each row.
    LocalTensor<uint8_t> output_local = output.Out<uint8_t>();

    // NOTE(review): the operator is initialised but never asked to compute;
    // the copy below is done by hand. Confirm whether Init is still required.
    Crop<uint8_t> crop_op;
    crop_op.Init(src_h, src_w, channels, crop_top, crop_left, crop_h, crop_w);

    // Element offset of the window's top-left pixel inside the source image.
    const uint32_t input_offset = (crop_top * src_w + crop_left) * channels;
    // Row strides are loop-invariant, so compute them once.
    const uint32_t src_row_elems = src_w * channels;
    const uint32_t crop_row_elems = crop_w * channels;

    // Stage each window row in the local tensor, then write it to its
    // position in the densely packed output.
    for (uint32_t row = 0; row < crop_h; row++) {
        const uint32_t src_offset = input_offset + row * src_row_elems;
        const uint32_t dst_offset = row * crop_row_elems;

        DataCopy(output_local,
                input_gm + src_offset,
                crop_row_elems);
        DataCopy(output_gm + dst_offset,
                output_local,
                crop_row_elems);
    }

    output_local.Free();
}

4.3 图像填充(Pad)

// Kernel entry point: pads a uint8 image with a constant value using the ops-cv Pad operator.
//
// Parameters:
//   input_gm   - source image in global memory, laid out as [src_h, src_w, channels]
//   output_gm  - padded image in global memory, laid out as
//                [src_h + pad_top + pad_bottom, src_w + pad_left + pad_right, channels]
//   src_h, src_w, channels - source dimensions
//   pad_top, pad_bottom, pad_left, pad_right - number of rows / columns added on each side
//   pad_value  - value written into the added border
extern "C" __global__ __aicore__ void image_pad(
    GM_ADDR input_gm,
    GM_ADDR output_gm,
    uint32_t src_h,
    uint32_t src_w,
    uint32_t channels,
    uint32_t pad_top,
    uint32_t pad_bottom,
    uint32_t pad_left,
    uint32_t pad_right,
    uint8_t pad_value
)
{
    Tensor input = Tensor(input_gm);
    Tensor output = Tensor(output_gm);

    // Bind the input local tensor to the source image, as the other kernels in
    // this file do; previously it was passed to Compute without being acquired.
    LocalTensor<uint8_t> input_local = input.In<uint8_t>();
    LocalTensor<uint8_t> output_local = output.Out<uint8_t>();

    // Output dimensions after padding.
    const uint32_t dst_h = src_h + pad_top + pad_bottom;
    const uint32_t dst_w = src_w + pad_left + pad_right;

    // Configure and run the padding operator.
    Pad<uint8_t> pad_op;
    pad_op.Init(src_h, src_w, channels, pad_top, pad_bottom, pad_left, pad_right, pad_value);
    pad_op.Compute(input_local, output_local);

    // Copy the whole padded image back to global memory.
    DataCopy(output_gm, output_local, dst_h * dst_w * channels);

    input_local.Free();
    output_local.Free();
}

4.4 图像归一化(Normalize)

// Kernel entry point: converts a uint8 image to float and normalises it per channel.
//
// Each sample is scaled to [0, 1] by dividing by 255 and then mapped to
// (value - mean[c]) / std[c]. Pixels are processed in blocks of up to 512.
//
// Parameters:
//   input_gm   - source image in global memory, [h, w, channels], uint8
//   output_gm  - normalised image in global memory, [h, w, channels], float
//   mean_gm    - per-channel mean, [channels]
//   std_gm     - per-channel standard deviation, [channels]
//   h, w, channels - image dimensions
extern "C" __global__ __aicore__ void image_normalize(
    GM_ADDR input_gm,
    GM_ADDR output_gm,
    GM_ADDR mean_gm,
    GM_ADDR std_gm,
    uint32_t h,
    uint32_t w,
    uint32_t channels
)
{
    Tensor src_tensor(input_gm);
    Tensor dst_tensor(output_gm);
    Tensor mean_tensor(mean_gm);
    Tensor std_tensor(std_gm);

    LocalTensor<uint8_t> src_local = src_tensor.In<uint8_t>();
    LocalTensor<float> dst_local = dst_tensor.Out<float>();
    LocalTensor<float> mean_local = mean_tensor.In<float>();
    LocalTensor<float> std_local = std_tensor.In<float>();

    const uint32_t pixels_per_block = 512;
    for (uint32_t first_pixel = 0; first_pixel < h * w; first_pixel += pixels_per_block) {
        // The final block may hold fewer than pixels_per_block pixels.
        uint32_t block_pixels;
        if ((first_pixel + pixels_per_block) > (h * w)) {
            block_pixels = h * w - first_pixel;
        } else {
            block_pixels = pixels_per_block;
        }

        // Load this block of interleaved samples.
        DataCopy(src_local, input_gm + first_pixel * channels, block_pixels * channels);

        // Walk the block channel by channel so mean/std are fetched once per channel.
        for (uint32_t ch = 0; ch < channels; ch++) {
            float channel_mean = mean_local.GetValue(ch);
            float channel_std = std_local.GetValue(ch);

            for (uint32_t px = 0; px < block_pixels; px++) {
                // Position of this sample in the interleaved block.
                uint32_t sample = px * channels + ch;

                // Scale to [0, 1], then normalise.
                float scaled = static_cast<float>(src_local.GetValue(sample)) / 255.0f;
                float result = (scaled - channel_mean) / channel_std;

                dst_local.SetValue(sample, result);
            }
        }

        // Write the block back; the output offset is scaled by sizeof(float)
        // because the output elements are floats.
        DataCopy(output_gm + first_pixel * channels * sizeof(float),
                dst_local,
                block_pixels * channels);
    }

    src_local.Free();
    dst_local.Free();
    mean_local.Free();
    std_local.Free();
}

五、色彩空间转换示例

5.1 RGB转GRAY(灰度化)

// Kernel entry point: converts a uint8 RGB image to grayscale with the ops-cv RGB2Gray operator.
//
// Parameters:
//   input_gm   - RGB image in global memory, laid out as [h, w, 3]
//   output_gm  - grayscale image in global memory, laid out as [h, w]
//   h, w       - image dimensions
extern "C" __global__ __aicore__ void rgb2gray(
    GM_ADDR input_gm,
    GM_ADDR output_gm,
    uint32_t h,
    uint32_t w
)
{
    Tensor rgb_tensor(input_gm);
    Tensor gray_tensor(output_gm);

    LocalTensor<uint8_t> rgb_local = rgb_tensor.In<uint8_t>();
    LocalTensor<uint8_t> gray_local = gray_tensor.Out<uint8_t>();

    // Documented weighting: gray = 0.299*R + 0.587*G + 0.114*B
    RGB2Gray<uint8_t> converter;
    converter.Init(h, w);
    converter.Compute(rgb_local, gray_local);

    // One output sample per pixel.
    const uint32_t gray_elems = h * w;
    DataCopy(output_gm, gray_local, gray_elems);

    rgb_local.Free();
    gray_local.Free();
}

5.2 RGB转YUV

// Kernel entry point: converts a uint8 RGB image to YUV with the ops-cv RGB2YUV operator.
//
// Parameters:
//   input_gm   - RGB image in global memory, laid out as [h, w, 3]
//   output_gm  - YUV image in global memory, laid out as [h, w, 3]
//   h, w       - image dimensions
extern "C" __global__ __aicore__ void rgb2yuv(
    GM_ADDR input_gm,
    GM_ADDR output_gm,
    uint32_t h,
    uint32_t w
)
{
    Tensor rgb_tensor(input_gm);
    Tensor yuv_tensor(output_gm);

    LocalTensor<uint8_t> rgb_local = rgb_tensor.In<uint8_t>();
    LocalTensor<uint8_t> yuv_local = yuv_tensor.Out<uint8_t>();

    // Documented conversion:
    //   Y =  0.299*R   + 0.587*G   + 0.114*B
    //   U = -0.14713*R - 0.28886*G + 0.436*B   + 128
    //   V =  0.615*R   - 0.51499*G - 0.10001*B + 128
    RGB2YUV<uint8_t> converter;
    converter.Init(h, w);
    converter.Compute(rgb_local, yuv_local);

    // Three output samples per pixel.
    const uint32_t yuv_elems = h * w * 3;
    DataCopy(output_gm, yuv_local, yuv_elems);

    rgb_local.Free();
    yuv_local.Free();
}

5.3 BGR转RGB

// Kernel entry point: swaps the first and third channel of a uint8 3-channel image (BGR -> RGB).
//
// Pixels are processed in blocks of up to 512: each block is loaded into a
// local tensor, the channels are swapped sample by sample, and the block is
// written back.
//
// Parameters:
//   input_gm   - BGR image in global memory, laid out as [h, w, 3]
//   output_gm  - RGB image in global memory, laid out as [h, w, 3]
//   h, w       - image dimensions
extern "C" __global__ __aicore__ void bgr2rgb(
    GM_ADDR input_gm,
    GM_ADDR output_gm,
    uint32_t h,
    uint32_t w
)
{
    Tensor input = Tensor(input_gm);
    Tensor output = Tensor(output_gm);

    // Acquire the input local tensor before it is used as a copy destination,
    // matching image_normalize; previously it was left unbound.
    LocalTensor<uint8_t> input_local = input.In<uint8_t>();
    LocalTensor<uint8_t> output_local = output.Out<uint8_t>();

    const uint32_t block_size = 512;
    const uint32_t total_pixels = h * w;
    for (uint32_t i = 0; i < total_pixels; i += block_size) {
        // The final block may hold fewer than block_size pixels.
        const uint32_t current_size =
            (i + block_size) > total_pixels ? (total_pixels - i) : block_size;

        // Load this block of BGR samples.
        DataCopy(input_local, input_gm + i * 3, current_size * 3);

        // Swap channels 0 and 2; channel 1 (green) keeps its position.
        for (uint32_t j = 0; j < current_size; j++) {
            const uint8_t b = input_local.GetValue(j * 3);
            const uint8_t g = input_local.GetValue(j * 3 + 1);
            const uint8_t r = input_local.GetValue(j * 3 + 2);

            output_local.SetValue(j * 3, r);
            output_local.SetValue(j * 3 + 1, g);
            output_local.SetValue(j * 3 + 2, b);
        }

        // Write the RGB block back.
        DataCopy(output_gm + i * 3, output_local, current_size * 3);
    }

    input_local.Free();
    output_local.Free();
}

六、几何变换示例

6.1 图像翻转(Flip)

// Kernel entry point: flips a uint8 image with the ops-cv Flip operator.
//
// Parameters:
//   input_gm   - source image in global memory, laid out as [h, w, channels]
//   output_gm  - flipped image in global memory, laid out as [h, w, channels]
//   h, w, channels - image dimensions
//   mode       - flip direction (HORIZONTAL, VERTICAL or BOTH)
extern "C" __global__ __aicore__ void image_flip(
    GM_ADDR input_gm,
    GM_ADDR output_gm,
    uint32_t h,
    uint32_t w,
    uint32_t channels,
    FlipMode mode
)
{
    Tensor input = Tensor(input_gm);
    Tensor output = Tensor(output_gm);

    // Acquire the input local tensor and hand it to the operator, as the other
    // operator kernels in this file do. Previously the tensor was left unbound
    // and the raw global address was passed to Compute instead.
    // NOTE(review): assumes Flip::Compute takes a LocalTensor input like the
    // sibling operators — confirm against the ops-cv header.
    LocalTensor<uint8_t> input_local = input.In<uint8_t>();
    LocalTensor<uint8_t> output_local = output.Out<uint8_t>();

    // Configure and run the flip operator.
    Flip<uint8_t> flip_op;
    flip_op.Init(h, w, channels, mode);
    flip_op.Compute(input_local, output_local);

    // Copy the whole flipped image back to global memory.
    DataCopy(output_gm, output_local, h * w * channels);

    input_local.Free();
    output_local.Free();
}

6.2 图像旋转(Rotate)

// Kernel entry point: rotates a uint8 image with the ops-cv Rotate operator.
//
// Parameters:
//   input_gm   - source image in global memory, laid out as [h, w, channels]
//   output_gm  - rotated image in global memory, laid out as [h, w, channels]
//   h, w, channels     - image dimensions (unchanged by the rotation)
//   angle              - rotation angle in degrees
//   center_x, center_y - rotation centre
//   fill_value         - fill value handed to the operator
extern "C" __global__ __aicore__ void image_rotate(
    GM_ADDR input_gm,
    GM_ADDR output_gm,
    uint32_t h,
    uint32_t w,
    uint32_t channels,
    float angle,
    float center_x,
    float center_y,
    uint8_t fill_value
)
{
    Tensor src_tensor(input_gm);
    Tensor dst_tensor(output_gm);

    LocalTensor<uint8_t> src_local = src_tensor.In<uint8_t>();
    LocalTensor<uint8_t> dst_local = dst_tensor.Out<uint8_t>();

    // Configure and run the rotation operator.
    Rotate<uint8_t> rotator;
    rotator.Init(h, w, channels, angle, center_x, center_y, fill_value);
    rotator.Compute(src_local, dst_local);

    // Copy the whole rotated image back to global memory.
    const uint32_t image_elems = h * w * channels;
    DataCopy(output_gm, dst_local, image_elems);

    src_local.Free();
    dst_local.Free();
}

6.3 仿射变换(Affine)

// Kernel entry point: applies an affine transform to a uint8 image with the ops-cv Affine operator.
//
// Parameters:
//   input_gm   - source image in global memory, laid out as [src_h, src_w, channels]
//   output_gm  - transformed image in global memory, laid out as [dst_h, dst_w, channels]
//   matrix_gm  - affine matrix in global memory, shape [2, 3], read as float
//   src_h, src_w - source dimensions
//   dst_h, dst_w - output dimensions
//   channels   - number of channels
//   fill_value - fill value handed to the operator
extern "C" __global__ __aicore__ void affine_transform(
    GM_ADDR input_gm,
    GM_ADDR output_gm,
    GM_ADDR matrix_gm,
    uint32_t src_h,
    uint32_t src_w,
    uint32_t dst_h,
    uint32_t dst_w,
    uint32_t channels,
    uint8_t fill_value
)
{
    Tensor src_tensor(input_gm);
    Tensor dst_tensor(output_gm);
    Tensor coeff_tensor(matrix_gm);

    LocalTensor<uint8_t> src_local = src_tensor.In<uint8_t>();
    LocalTensor<uint8_t> dst_local = dst_tensor.Out<uint8_t>();
    LocalTensor<float> coeff_local = coeff_tensor.In<float>();

    // Configure the operator, give it the matrix, then run it.
    Affine<uint8_t> warper;
    warper.Init(src_h, src_w, dst_h, dst_w, channels, fill_value);
    warper.SetMatrix(coeff_local);
    warper.Compute(src_local, dst_local);

    // Copy the whole transformed image back to global memory.
    const uint32_t dst_elems = dst_h * dst_w * channels;
    DataCopy(output_gm, dst_local, dst_elems);

    src_local.Free();
    dst_local.Free();
    coeff_local.Free();
}

七、目标检测相关算子示例

7.1 非极大值抑制(NMS)

NMS是目标检测中用于去除重复框的关键算法:

// One detection candidate: a box given by two corner points, plus its
// confidence score and class label. Member order matches the
// (x1, y1, x2, y2, score, class_id) record layout.
struct Box {
    float x1;      // x of the first corner
    float y1;      // y of the first corner
    float x2;      // x of the second corner
    float y2;      // y of the second corner
    float score;   // confidence score
    int class_id;  // class label
};

// Kernel entry point: greedy per-class non-maximum suppression.
//
// Boxes are sorted by score, then visited in that order. A box that is not
// yet suppressed and whose score is at least score_threshold is kept, and it
// suppresses every later box of the same class whose IoU with it exceeds
// iou_threshold.
//
// Parameters:
//   boxes_gm        - candidate boxes in global memory, [N, 6]
//                     (x1, y1, x2, y2, score, class_id)
//   output_gm       - indices of the kept boxes, [M]
//   num_output_gm   - receives the number of kept boxes
//   num_boxes       - N
//   iou_threshold   - boxes with IoU strictly above this are suppressed
//   score_threshold - boxes scoring below this are never kept
extern "C" __global__ __aicore__ void nms(
    GM_ADDR boxes_gm,
    GM_ADDR output_gm,
    GM_ADDR num_output_gm,
    uint32_t num_boxes,
    float iou_threshold,
    float score_threshold
)
{
    Tensor boxes = Tensor(boxes_gm);
    Tensor output = Tensor(output_gm);
    Tensor num_output = Tensor(num_output_gm);

    LocalTensor<Box> boxes_local = boxes.In<Box>();
    LocalTensor<int32_t> output_local = output.Out<int32_t>();
    // NOTE(review): unlike the other local tensors, this one is never obtained
    // from a Tensor before it is written — confirm how scratch tensors are
    // meant to be allocated.
    LocalTensor<uint8_t> suppressed_local;  // per-box suppression flag

    // Clear all suppression flags.
    Duplicate<uint8_t, uint8_t> dup;
    dup.Compute(0, suppressed_local, num_boxes);

    // Sort boxes by score (presumably descending, per the `true` flag — TODO confirm).
    // NOTE(review): the indices written to output_local below are positions in
    // this sorted order, not positions in the caller's original array.
    Sort<Box, float> sort_op;
    sort_op.SortByScore(boxes_local, num_boxes, true);

    int32_t keep_count = 0;
    for (uint32_t i = 0; i < num_boxes; i++) {
        if (suppressed_local.GetValue(i) == 1) continue;

        Box box_i = boxes_local.GetValue(i);
        if (box_i.score < score_threshold) continue;

        // Keep this box.
        output_local.SetValue(keep_count++, i);

        // The area of box i does not depend on j, so compute it once per kept
        // box instead of once per pair.
        const float box_i_area = (box_i.x2 - box_i.x1) * (box_i.y2 - box_i.y1);

        // Suppress later boxes of the same class that overlap box i too much.
        for (uint32_t j = i + 1; j < num_boxes; j++) {
            if (suppressed_local.GetValue(j) == 1) continue;

            Box box_j = boxes_local.GetValue(j);

            // Boxes of different classes never suppress each other.
            if (box_i.class_id != box_j.class_id) continue;

            // Intersection rectangle; a negative extent is clamped to zero.
            float inter_x1 = fmax(box_i.x1, box_j.x1);
            float inter_y1 = fmax(box_i.y1, box_j.y1);
            float inter_x2 = fmin(box_i.x2, box_j.x2);
            float inter_y2 = fmin(box_i.y2, box_j.y2);

            float inter_area = fmax(0.0f, inter_x2 - inter_x1) * fmax(0.0f, inter_y2 - inter_y1);
            float box_j_area = (box_j.x2 - box_j.x1) * (box_j.y2 - box_j.y1);
            float union_area = box_i_area + box_j_area - inter_area;

            // A non-positive union yields IoU 0 rather than a division by zero.
            float iou = (union_area > 0) ? (inter_area / union_area) : 0.0f;

            if (iou > iou_threshold) {
                suppressed_local.SetValue(j, 1);
            }
        }
    }

    // Write the kept indices and their count back to global memory.
    DataCopy(output_gm, output_local, keep_count);
    memcpy(num_output_gm, &keep_count, sizeof(int32_t));

    boxes_local.Free();
    output_local.Free();
    suppressed_local.Free();
}

7.2 ROI Align

ROI Align是实例分割和目标检测中的关键算子:

// Kernel entry point: runs the ops-cv ROIAlign operator once per region of interest.
//
// Parameters:
//   feature_gm - input feature map in global memory, [N, C, H, W], float
//   rois_gm    - regions of interest in global memory, [M, 5], each row being
//                (batch_idx, x1, y1, x2, y2), float
//   output_gm  - pooled features in global memory, [M, C, pooled_h, pooled_w], float
//   batch, channels, feature_h, feature_w - feature-map dimensions
//   num_rois   - M
//   pooled_h, pooled_w - output size per region
//   spatial_scale, sampling_ratio - forwarded unchanged to the operator's Init
extern "C" __global__ __aicore__ void roi_align(
    GM_ADDR feature_gm,
    GM_ADDR rois_gm,
    GM_ADDR output_gm,
    uint32_t batch,
    uint32_t channels,
    uint32_t feature_h,
    uint32_t feature_w,
    uint32_t num_rois,
    uint32_t pooled_h,
    uint32_t pooled_w,
    float spatial_scale,
    uint32_t sampling_ratio
)
{
    Tensor feature_tensor(feature_gm);
    Tensor roi_tensor(rois_gm);
    Tensor pooled_tensor(output_gm);

    LocalTensor<float> feature_local = feature_tensor.In<float>();
    LocalTensor<float> roi_local = roi_tensor.In<float>();
    LocalTensor<float> pooled_local = pooled_tensor.Out<float>();

    // Configure the operator once; it is then reused for every region.
    ROIAlign<float> aligner;
    aligner.Init(batch, channels, feature_h, feature_w,
                 pooled_h, pooled_w, spatial_scale, sampling_ratio);

    for (uint32_t r = 0; r < num_rois; r++) {
        // Each region occupies five consecutive floats.
        const uint32_t row = r * 5;

        // The batch index is stored as a float and truncated to an integer.
        uint32_t image_index = static_cast<uint32_t>(roi_local.GetValue(row));
        float left = roi_local.GetValue(row + 1);
        float top = roi_local.GetValue(row + 2);
        float right = roi_local.GetValue(row + 3);
        float bottom = roi_local.GetValue(row + 4);

        // Pool this region; the region index tells the operator which output slot to use.
        aligner.Compute(feature_local, image_index, left, top, right, bottom,
                        pooled_local, r);
    }

    // Copy all pooled regions back to global memory.
    const uint32_t pooled_elems = num_rois * channels * pooled_h * pooled_w;
    DataCopy(output_gm, pooled_local, pooled_elems);

    feature_local.Free();
    roi_local.Free();
    pooled_local.Free();
}

八、图像滤波示例

8.1 高斯模糊(Gaussian Blur)

// Kernel entry point: blurs a uint8 image with the ops-cv GaussianBlur operator.
//
// Parameters:
//   input_gm   - source image in global memory, laid out as [h, w, channels]
//   output_gm  - blurred image in global memory, laid out as [h, w, channels]
//   kernel_gm  - Gaussian kernel coefficients in global memory, read as float
//   h, w, channels - image dimensions
//   kernel_size, sigma - forwarded unchanged to the operator's Init
extern "C" __global__ __aicore__ void gaussian_blur(
    GM_ADDR input_gm,
    GM_ADDR output_gm,
    GM_ADDR kernel_gm,
    uint32_t h,
    uint32_t w,
    uint32_t channels,
    uint32_t kernel_size,
    float sigma
)
{
    Tensor src_tensor(input_gm);
    Tensor dst_tensor(output_gm);
    Tensor coeff_tensor(kernel_gm);

    LocalTensor<uint8_t> src_local = src_tensor.In<uint8_t>();
    LocalTensor<uint8_t> dst_local = dst_tensor.Out<uint8_t>();
    LocalTensor<float> coeff_local = coeff_tensor.In<float>();

    // Configure the operator, give it the kernel, then run it.
    GaussianBlur<uint8_t> blurrer;
    blurrer.Init(h, w, channels, kernel_size, sigma);
    blurrer.SetKernel(coeff_local);
    blurrer.Compute(src_local, dst_local);

    // Copy the whole blurred image back to global memory.
    const uint32_t image_elems = h * w * channels;
    DataCopy(output_gm, dst_local, image_elems);

    src_local.Free();
    dst_local.Free();
    coeff_local.Free();
}

8.2 Sobel边缘检测

// Kernel entry point: runs the ops-cv Sobel operator on a uint8 grayscale image.
//
// Parameters:
//   input_gm   - grayscale image in global memory, laid out as [h, w]
//   output_gm  - gradient image in global memory, laid out as [h, w]
//   h, w       - image dimensions
extern "C" __global__ __aicore__ void sobel_edge_detection(
    GM_ADDR input_gm,
    GM_ADDR output_gm,
    uint32_t h,
    uint32_t w
)
{
    Tensor gray_tensor(input_gm);
    Tensor grad_tensor(output_gm);

    LocalTensor<uint8_t> gray_local = gray_tensor.In<uint8_t>();
    LocalTensor<uint8_t> grad_local = grad_tensor.Out<uint8_t>();

    // Configure and run the Sobel operator.
    Sobel<uint8_t> edge_op;
    edge_op.Init(h, w);
    edge_op.Compute(gray_local, grad_local);

    // One output sample per pixel.
    const uint32_t pixel_count = h * w;
    DataCopy(output_gm, grad_local, pixel_count);

    gray_local.Free();
    grad_local.Free();
}

九、性能优化建议

  1. 合理选择插值算法:在Resize时,Bilinear通常能在速度与画质之间取得较好的平衡;Nearest速度最快但锯齿明显,Bicubic画质更好但计算量更大
  2. 批量处理:对于多张图像,尽量使用批量处理以提高效率
  3. 数据对齐:确保数据起始地址满足对齐要求(如512字节对齐)
  4. 原地计算:对于支持原地计算的算子,优先使用以减少内存拷贝
  5. 使用零拷贝:在数据传递时尽量使用指针而非数据拷贝

十、应用场景

ops-cv算子库广泛应用于以下场景:

| 场景 | 描述 | 推荐算子 |
| --- | --- | --- |
| 图像分类 | 预处理和后处理 | Resize, Normalize, Crop |
| 目标检测 | 检测框处理 | NMS, ROIAlign, Resize |
| 语义分割 | 图像预处理 | Resize, Normalize, Pad |
| 实例分割 | ROI特征提取 | ROIAlign, Mask |
| 图像增强 | 数据增强 | Flip, Rotate, Affine, ColorJitter |
| OCR | 文本图像处理 | Gray, Threshold, Morphology |

十一、总结

ops-cv作为CANN生态系统中专门针对计算机视觉任务的算子库,为图像处理和目标检测应用提供了丰富的算子支持。通过本文的介绍和示例代码,开发者可以全面了解ops-cv的功能特性,并掌握其在实际项目中的应用方法。无论是基础的图像预处理还是复杂的目标检测后处理,ops-cv都提供了经过深度优化的高性能实现。

相关链接:

  • CANN组织链接:https://atomgit.com/cann
  • ops-cv仓库链接:https://atomgit.com/cann/ops-cv
Logo

昇腾计算产业是基于昇腾系列(HUAWEI Ascend)处理器和基础软件构建的全栈AI计算基础设施、行业应用及服务,包括昇腾系列处理器、系列硬件、CANN、AI计算框架、应用使能、开发工具链、管理运维工具、行业应用及服务等全产业链。(https://devpress.csdn.net/organization/setting/general/146749)

更多推荐